diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-64/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lora_plus_20251202_001141/checkpoint-64/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0  # truthy enables the verbose prints below; the command line entry point sets it from --debug
+
+# every checkpoint file is mapped to the cpu when loaded (passed as ``map_location`` to ``torch.load``)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol: for every param group the partitions of all ranks are concatenated
+    # into one flat fp32 vector, then each param is carved out of it in ``param_shapes`` order by
+    # its numel; what remains at the end of a group is padding (see the aligned sanity check below)
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params: each entry added to state_dict is a narrow()+view() of the merged group vector
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check: after rounding both up to the alignment they have to agree
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files (created if it does not exist)
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB; ``None`` writes one unsharded file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check: fail before any checkpoint is loaded if an optional package is missing
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: the split is planned on empty tensors of the same shape/dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard; only the tensors of the current shard are materialized at a time
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/chat_template.jinja b/dapo_lorafa_20251202_173337/checkpoint-576/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/special_tokens_map.json b/dapo_lorafa_20251202_173337/checkpoint-576/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json b/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d4a0dcee8438cb49e7c5f2a024517ecf5b0125c
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json
@@ -0,0 +1,17890 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5298988040478381,
+  "eval_steps": 500,
+  "global_step": 576,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025743793230503798,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.920872470393078e-06,
+      "clip_ratio/high_mean": 1.2302181175982696e-06,
+      "clip_ratio/low_mean": 2.9912232776041492e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1142450779952924e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14377.0,
+      "completions/max_terminated_length": 14377.0,
+      "completions/mean_length": 4861.1796875,
+      "completions/mean_terminated_length": 4861.1796875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 1.0784558206796646,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023554943036288023,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 1437829.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991774559021,
+      "sampling/importance_sampling_ratio/min": 0.00045694064465351403,
+      "sampling/sampling_logp_difference/max": 7.690957069396973,
+      "sampling/sampling_logp_difference/mean": 0.018809247761964798,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 1.673043971095467e-05,
+      "clip_ratio/high_mean": 4.8752071961644106e-06,
+      "clip_ratio/low_mean": 2.1540331545111258e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6415538741275668e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15479.0,
+      "completions/mean_length": 6167.5078125,
+      "completions/mean_terminated_length": 5922.3125,
+      "completions/min_length": 788.0,
+      "completions/min_terminated_length": 788.0,
+      "entropy": 1.1373522356152534,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002558506093919277,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 2245838.0,
+      "reward": 0.296875,
+      "reward_std": 0.2669745087623596,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000486373901367,
+      "sampling/importance_sampling_ratio/min": 2.8637201467063278e-05,
+      "sampling/sampling_logp_difference/max": 10.460803985595703,
+      "sampling/sampling_logp_difference/mean": 0.02123238891363144,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 4.3118818666698644e-05,
+      "clip_ratio/high_mean": 1.0779704666674661e-05,
+      "clip_ratio/low_mean": 3.257358957853285e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.335329458626802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 5691.9296875,
+      "completions/mean_terminated_length": 5435.3203125,
+      "completions/min_length": 535.0,
+      "completions/min_terminated_length": 535.0,
+      "entropy": 1.1964457035064697,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001936351996846497,
+      "learning_rate": 1e-05,
+      "loss": 0.0366,
+      "num_tokens": 2998805.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2727435827255249,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518990516663,
+      "sampling/importance_sampling_ratio/min": 9.316575415141415e-06,
+      "sampling/sampling_logp_difference/max": 11.583715438842773,
+      "sampling/sampling_logp_difference/mean": 0.021076630800962448,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 1.666655725784949e-05,
+      "clip_ratio/high_mean": 4.1666393144623726e-06,
+      "clip_ratio/low_mean": 2.0471738594096678e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4638378022245888e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15148.0,
+      "completions/max_terminated_length": 15148.0,
+      "completions/mean_length": 5535.828125,
+      "completions/mean_terminated_length": 5535.828125,
+      "completions/min_length": 365.0,
+      "completions/min_terminated_length": 365.0,
+      "entropy": 1.0935996025800705,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003226158209145069,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 3727959.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000033378601074,
+      "sampling/importance_sampling_ratio/min": 5.9354013501433656e-05,
+      "sampling/sampling_logp_difference/max": 9.731990814208984,
+      "sampling/sampling_logp_difference/mean": 0.019589610397815704,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 1.9090986825176515e-05,
+      "clip_ratio/high_mean": 4.772746706294129e-06,
+      "clip_ratio/low_mean": 1.995503203033877e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4727778054511873e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14820.0,
+      "completions/mean_length": 4552.9296875,
+      "completions/mean_terminated_length": 4459.771484375,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.9019740223884583,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002911025658249855,
+      "learning_rate": 1e-05,
+      "loss": 0.0742,
+      "num_tokens": 4329342.0,
+      "reward": 0.4375,
+      "reward_std": 0.3448186218738556,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999092817306519,
+      "sampling/importance_sampling_ratio/min": 0.0010333366226404905,
+      "sampling/sampling_logp_difference/max": 6.874962329864502,
+      "sampling/sampling_logp_difference/mean": 0.01768551766872406,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 9.186584293274791e-06,
+      "clip_ratio/high_mean": 2.2966460733186977e-06,
+      "clip_ratio/low_mean": 1.9561108047128073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.185775372254284e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14197.0,
+      "completions/mean_length": 5849.4921875,
+      "completions/mean_terminated_length": 5682.2783203125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.1362405940890312,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018774238415062428,
+      "learning_rate": 1e-05,
+      "loss": 0.0106,
+      "num_tokens": 5097245.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2369818240404129,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999154210090637,
+      "sampling/importance_sampling_ratio/min": 0.00020401047368068248,
+      "sampling/sampling_logp_difference/max": 8.497339248657227,
+      "sampling/sampling_logp_difference/mean": 0.020379718393087387,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 7.997417014848907e-06,
+      "clip_ratio/high_mean": 1.9993542537122266e-06,
+      "clip_ratio/low_mean": 4.003535150332027e-05,
+      "clip_ratio/low_min": 4.32017714047106e-06,
+      "clip_ratio/region_mean": 4.203470598440617e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16045.0,
+      "completions/mean_length": 5744.6796875,
+      "completions/mean_terminated_length": 5575.8017578125,
+      "completions/min_length": 376.0,
+      "completions/min_terminated_length": 376.0,
+      "entropy": 0.989105150103569,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0025437718722969294,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 5851844.0,
+      "reward": 0.375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999915957450867,
+      "sampling/importance_sampling_ratio/min": 4.312803503125906e-05,
+      "sampling/sampling_logp_difference/max": 10.051337242126465,
+      "sampling/sampling_logp_difference/mean": 0.020163267850875854,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 5.422758022177732e-06,
+      "clip_ratio/high_mean": 1.355689505544433e-06,
+      "clip_ratio/low_mean": 3.697482691222831e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.833051641777274e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15874.0,
+      "completions/mean_length": 4075.9609375,
+      "completions/mean_terminated_length": 3979.047119140625,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.8887222409248352,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024127138312906027,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 6392287.0,
+      "reward": 0.4140625,
+      "reward_std": 0.32825323939323425,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999527335166931,
+      "sampling/importance_sampling_ratio/min": 4.007668394478969e-05,
+      "sampling/sampling_logp_difference/max": 10.124715805053711,
+      "sampling/sampling_logp_difference/mean": 0.017202626913785934,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 1.9414138932916103e-05,
+      "clip_ratio/high_mean": 5.8681449672803865e-06,
+      "clip_ratio/low_mean": 4.918625745631289e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.5054402309906436e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15691.0,
+      "completions/mean_length": 5248.3984375,
+      "completions/mean_terminated_length": 4981.14404296875,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.7111036106944084,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0028383845929056406,
+      "learning_rate": 1e-05,
+      "loss": 0.1027,
+      "num_tokens": 7081234.0,
+      "reward": 0.5625,
+      "reward_std": 0.4150439500808716,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589323997498,
+      "sampling/importance_sampling_ratio/min": 0.00037057927693240345,
+      "sampling/sampling_logp_difference/max": 7.900443077087402,
+      "sampling/sampling_logp_difference/mean": 0.01570993661880493,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 7.0035857788752764e-06,
+      "clip_ratio/high_mean": 1.7508964447188191e-06,
+      "clip_ratio/low_mean": 1.4078211620471848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5829108065190667e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16172.0,
+      "completions/max_terminated_length": 16172.0,
+      "completions/mean_length": 4956.6015625,
+      "completions/mean_terminated_length": 4956.6015625,
+      "completions/min_length": 314.0,
+      "completions/min_terminated_length": 314.0,
+      "entropy": 1.026921771466732,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001392067177221179,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 7735511.0,
+      "reward": 0.328125,
+      "reward_std": 0.24777325987815857,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 0.00033587991492822766,
+      "sampling/sampling_logp_difference/max": 7.9987568855285645,
+      "sampling/sampling_logp_difference/mean": 0.019166938960552216,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9272594929352636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9272594929352636e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16229.0,
+      "completions/mean_length": 5858.953125,
+      "completions/mean_terminated_length": 5691.88916015625,
+      "completions/min_length": 356.0,
+      "completions/min_terminated_length": 356.0,
+      "entropy": 1.1407905519008636,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018726681591942906,
+      "learning_rate": 1e-05,
+      "loss": 0.092,
+      "num_tokens": 8506089.0,
+      "reward": 0.25,
+      "reward_std": 0.2829982340335846,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998714327812195,
+      "sampling/importance_sampling_ratio/min": 2.4313605536008254e-05,
+      "sampling/sampling_logp_difference/max": 10.62447452545166,
+      "sampling/sampling_logp_difference/mean": 0.020790230482816696,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 4.318236733524827e-06,
+      "clip_ratio/high_mean": 1.0795591833812068e-06,
+      "clip_ratio/low_mean": 3.3191785689723474e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.427134498679152e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15040.0,
+      "completions/mean_length": 6801.09375,
+      "completions/mean_terminated_length": 6571.1044921875,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 1.185454584658146,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031675526406615973,
+      "learning_rate": 1e-05,
+      "loss": 0.0244,
+      "num_tokens": 9398597.0,
+      "reward": 0.21875,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000109672546387,
+      "sampling/importance_sampling_ratio/min": 0.0010334982071071863,
+      "sampling/sampling_logp_difference/max": 6.874805927276611,
+      "sampling/sampling_logp_difference/mean": 0.021565770730376244,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3892819879401941e-05,
+      "clip_ratio/high_mean": 3.4732049698504852e-06,
+      "clip_ratio/low_mean": 2.9275798283379117e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2749003707976954e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15496.0,
+      "completions/mean_length": 4673.578125,
+      "completions/mean_terminated_length": 4581.3701171875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9907316789031029,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0024632434360682964,
+      "learning_rate": 1e-05,
+      "loss": 0.0147,
+      "num_tokens": 10016559.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000066757202148,
+      "sampling/importance_sampling_ratio/min": 0.001339821144938469,
+      "sampling/sampling_logp_difference/max": 6.6152191162109375,
+      "sampling/sampling_logp_difference/mean": 0.019262395799160004,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.6510958175786072e-05,
+      "clip_ratio/high_mean": 4.127739543946518e-06,
+      "clip_ratio/low_mean": 1.770910688492222e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1836846656242415e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14983.0,
+      "completions/mean_length": 4617.4140625,
+      "completions/mean_terminated_length": 4524.763671875,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "entropy": 1.100720427930355,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032584660220891237,
+      "learning_rate": 1e-05,
+      "loss": 0.0047,
+      "num_tokens": 10628084.0,
+      "reward": 0.375,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999375343322754,
+      "sampling/importance_sampling_ratio/min": 4.245261607138673e-06,
+      "sampling/sampling_logp_difference/max": 12.369707107543945,
+      "sampling/sampling_logp_difference/mean": 0.019928477704524994,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 9.921910532284528e-06,
+      "clip_ratio/high_mean": 3.5021869280171813e-06,
+      "clip_ratio/low_mean": 1.4621458831243217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.81236457592604e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13925.0,
+      "completions/mean_length": 5611.5625,
+      "completions/mean_terminated_length": 5353.0244140625,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 1.0112926587462425,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.001977710286155343,
+      "learning_rate": 1e-05,
+      "loss": -0.0229,
+      "num_tokens": 11364332.0,
+      "reward": 0.2109375,
+      "reward_std": 0.21146979928016663,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999548196792603,
+      "sampling/importance_sampling_ratio/min": 4.5400451199384406e-05,
+      "sampling/sampling_logp_difference/max": 9.999988555908203,
+      "sampling/sampling_logp_difference/mean": 0.019674532115459442,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 8.318262189277448e-06,
+      "clip_ratio/high_mean": 2.079565547319362e-06,
+      "clip_ratio/low_mean": 3.345101845297904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5530583886611566e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14463.0,
+      "completions/mean_length": 5321.7578125,
+      "completions/mean_terminated_length": 5234.6533203125,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "entropy": 0.9611762389540672,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002321678213775158,
+      "learning_rate": 1e-05,
+      "loss": 0.0089,
+      "num_tokens": 12067365.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22225630283355713,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 5.329983650881331e-06,
+      "sampling/sampling_logp_difference/max": 12.142162322998047,
+      "sampling/sampling_logp_difference/mean": 0.019090529531240463,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.1286541861372825e-05,
+      "clip_ratio/low_min": 4.589008312905207e-06,
+      "clip_ratio/region_mean": 5.1286541861372825e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15906.0,
+      "completions/mean_length": 6747.8125,
+      "completions/mean_terminated_length": 6516.54443359375,
+      "completions/min_length": 65.0,
+      "completions/min_terminated_length": 65.0,
+      "entropy": 0.8531035929918289,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003335036803036928,
+      "learning_rate": 1e-05,
+      "loss": 0.0494,
+      "num_tokens": 12950989.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999262690544128,
+      "sampling/importance_sampling_ratio/min": 0.0024787711445242167,
+      "sampling/sampling_logp_difference/max": 5.999992370605469,
+      "sampling/sampling_logp_difference/mean": 0.017946189269423485,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.059201583255344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.059201583255344e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14653.0,
+      "completions/mean_length": 5237.5390625,
+      "completions/mean_terminated_length": 5060.611328125,
+      "completions/min_length": 31.0,
+      "completions/min_terminated_length": 31.0,
+      "entropy": 0.9604798555374146,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028048555832356215,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 13641594.0,
+      "reward": 0.3359375,
+      "reward_std": 0.27851757407188416,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999921977519989,
+      "sampling/importance_sampling_ratio/min": 0.0003354719083290547,
+      "sampling/sampling_logp_difference/max": 7.999972343444824,
+      "sampling/sampling_logp_difference/mean": 0.01799672283232212,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7391609592086752e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7391609592086752e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14949.0,
+      "completions/mean_length": 5088.71875,
+      "completions/mean_terminated_length": 4999.779296875,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.9381079524755478,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0015588597161695361,
+      "learning_rate": 1e-05,
+      "loss": 0.0593,
+      "num_tokens": 14310022.0,
+      "reward": 0.3515625,
+      "reward_std": 0.24723157286643982,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999968945980072,
+      "sampling/importance_sampling_ratio/min": 0.0008060967666096985,
+      "sampling/sampling_logp_difference/max": 7.123306751251221,
+      "sampling/sampling_logp_difference/mean": 0.018512990325689316,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.4323140021588188e-05,
+      "clip_ratio/high_mean": 3.580785005397047e-06,
+      "clip_ratio/low_mean": 2.3172296550910687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6753081669994572e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15268.0,
+      "completions/max_terminated_length": 15268.0,
+      "completions/mean_length": 5374.375,
+      "completions/mean_terminated_length": 5374.375,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "entropy": 1.198778212070465,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023761435877531767,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 15017710.0,
+      "reward": 0.21875,
+      "reward_std": 0.2432974874973297,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000046730041504,
+      "sampling/importance_sampling_ratio/min": 2.2531810827786103e-05,
+      "sampling/sampling_logp_difference/max": 10.700582504272461,
+      "sampling/sampling_logp_difference/mean": 0.02083735726773739,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 8.891734069038648e-06,
+      "clip_ratio/high_mean": 2.222933517259662e-06,
+      "clip_ratio/low_mean": 3.576970004814939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.799263345172221e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16131.0,
+      "completions/max_terminated_length": 16131.0,
+      "completions/mean_length": 5016.484375,
+      "completions/mean_terminated_length": 5016.484375,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "entropy": 1.0073698610067368,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0024441592395305634,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 15680364.0,
+      "reward": 0.2734375,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 0.0009118849993683398,
+      "sampling/sampling_logp_difference/max": 6.999996662139893,
+      "sampling/sampling_logp_difference/mean": 0.019295595586299896,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 7.065739737299737e-06,
+      "clip_ratio/high_mean": 1.7664349343249341e-06,
+      "clip_ratio/low_mean": 4.2640075662347954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.440651059667289e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14751.0,
+      "completions/mean_length": 6798.171875,
+      "completions/mean_terminated_length": 6408.50390625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0817051529884338,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0035431634169071913,
+      "learning_rate": 1e-05,
+      "loss": -0.0282,
+      "num_tokens": 16572210.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3645517826080322,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 0.00014901062240824103,
+      "sampling/sampling_logp_difference/max": 8.811492919921875,
+      "sampling/sampling_logp_difference/mean": 0.021285930648446083,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 1.8304424429516075e-05,
+      "clip_ratio/high_mean": 4.576106107379019e-06,
+      "clip_ratio/low_mean": 3.600540730985813e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0581513530923985e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14866.0,
+      "completions/mean_length": 5388.6875,
+      "completions/mean_terminated_length": 5302.1103515625,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 1.1402523145079613,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003914100583642721,
+      "learning_rate": 1e-05,
+      "loss": 0.0017,
+      "num_tokens": 17282394.0,
+      "reward": 0.234375,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000356435775757,
+      "sampling/importance_sampling_ratio/min": 4.936015557177598e-06,
+      "sampling/sampling_logp_difference/max": 12.218952178955078,
+      "sampling/sampling_logp_difference/mean": 0.020141229033470154,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 3.6923258903698297e-06,
+      "clip_ratio/high_mean": 9.230814725924574e-07,
+      "clip_ratio/low_mean": 4.0747915363681386e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1670996779430425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15553.0,
+      "completions/mean_length": 5140.625,
+      "completions/mean_terminated_length": 4962.1591796875,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 0.9437280669808388,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026927352882921696,
+      "learning_rate": 1e-05,
+      "loss": 0.0467,
+      "num_tokens": 17963970.0,
+      "reward": 0.3125,
+      "reward_std": 0.3009189963340759,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961256980896,
+      "sampling/importance_sampling_ratio/min": 6.243770621949807e-05,
+      "sampling/sampling_logp_difference/max": 9.681341171264648,
+      "sampling/sampling_logp_difference/mean": 0.02010953240096569,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 9.832700470724376e-06,
+      "clip_ratio/high_mean": 2.458175117681094e-06,
+      "clip_ratio/low_mean": 1.5558874792986899e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.014062596979784e-06,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12937.0,
+      "completions/max_terminated_length": 12937.0,
+      "completions/mean_length": 5454.8515625,
+      "completions/mean_terminated_length": 5454.8515625,
+      "completions/min_length": 717.0,
+      "completions/min_terminated_length": 717.0,
+      "entropy": 1.1385098099708557,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027164353523403406,
+      "learning_rate": 1e-05,
+      "loss": 0.009,
+      "num_tokens": 18680591.0,
+      "reward": 0.296875,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000078558921814,
+      "sampling/importance_sampling_ratio/min": 0.005307729355990887,
+      "sampling/sampling_logp_difference/max": 5.238591194152832,
+      "sampling/sampling_logp_difference/mean": 0.020798511803150177,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.8564560832601273e-05,
+      "clip_ratio/high_mean": 4.641140208150318e-06,
+      "clip_ratio/low_mean": 1.8977171066580922e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.361831138841808e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15341.0,
+      "completions/mean_length": 6053.4296875,
+      "completions/mean_terminated_length": 5972.08642578125,
+      "completions/min_length": 639.0,
+      "completions/min_terminated_length": 639.0,
+      "entropy": 1.006893776357174,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016045555239543319,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 19474438.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26143792271614075,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 4.606551374308765e-05,
+      "sampling/sampling_logp_difference/max": 9.985445976257324,
+      "sampling/sampling_logp_difference/mean": 0.01937020570039749,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 3.951194685214432e-06,
+      "clip_ratio/high_mean": 9.87798671303608e-07,
+      "clip_ratio/low_mean": 3.949826844973359e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.048606700735036e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 5732.6328125,
+      "completions/mean_terminated_length": 5563.56396484375,
+      "completions/min_length": 658.0,
+      "completions/min_terminated_length": 658.0,
+      "entropy": 1.0205800458788872,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017355874879285693,
+      "learning_rate": 1e-05,
+      "loss": 0.0254,
+      "num_tokens": 20229199.0,
+      "reward": 0.2578125,
+      "reward_std": 0.32695505023002625,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966561794281,
+      "sampling/importance_sampling_ratio/min": 9.611312270862982e-05,
+      "sampling/sampling_logp_difference/max": 9.249984741210938,
+      "sampling/sampling_logp_difference/mean": 0.020152747631072998,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.1344701988491579e-05,
+      "clip_ratio/high_mean": 2.8361754971228947e-06,
+      "clip_ratio/low_mean": 6.441893049213832e-05,
+      "clip_ratio/low_min": 3.704581786223571e-06,
+      "clip_ratio/region_mean": 6.72551062734783e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 11633.0,
+      "completions/mean_length": 4968.0546875,
+      "completions/mean_terminated_length": 4786.849609375,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "entropy": 1.0484329834580421,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002361088991165161,
+      "learning_rate": 1e-05,
+      "loss": 0.1348,
+      "num_tokens": 20885790.0,
+      "reward": 0.265625,
+      "reward_std": 0.3180084228515625,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000216960906982,
+      "sampling/importance_sampling_ratio/min": 0.006972009316086769,
+      "sampling/sampling_logp_difference/max": 4.965851783752441,
+      "sampling/sampling_logp_difference/mean": 0.018748482689261436,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.939045106766571e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.939045106766571e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12655.0,
+      "completions/mean_length": 4634.640625,
+      "completions/mean_terminated_length": 4542.1259765625,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.0479918718338013,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002287437906488776,
+      "learning_rate": 1e-05,
+      "loss": -0.0157,
+      "num_tokens": 21497480.0,
+      "reward": 0.34375,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999415874481201,
+      "sampling/importance_sampling_ratio/min": 8.729176670385641e-07,
+      "sampling/sampling_logp_difference/max": 13.951424598693848,
+      "sampling/sampling_logp_difference/mean": 0.019327208399772644,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 2.4600531105534174e-05,
+      "clip_ratio/high_mean": 7.4163915542158065e-06,
+      "clip_ratio/low_mean": 3.8106682723082486e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.552307382255094e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15999.0,
+      "completions/mean_length": 5922.8359375,
+      "completions/mean_terminated_length": 5840.46435546875,
+      "completions/min_length": 565.0,
+      "completions/min_terminated_length": 565.0,
+      "entropy": 1.1925376057624817,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002192641608417034,
+      "learning_rate": 1e-05,
+      "loss": 0.0432,
+      "num_tokens": 22276267.0,
+      "reward": 0.1953125,
+      "reward_std": 0.22461041808128357,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999226987361908,
+      "sampling/importance_sampling_ratio/min": 1.546916053030145e-07,
+      "sampling/sampling_logp_difference/max": 15.681832313537598,
+      "sampling/sampling_logp_difference/mean": 0.026596486568450928,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 1.3442309864331037e-05,
+      "clip_ratio/high_mean": 3.360577466082759e-06,
+      "clip_ratio/low_mean": 2.185166863455379e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5212245873262873e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15082.0,
+      "completions/mean_length": 5835.5,
+      "completions/mean_terminated_length": 5752.44091796875,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 1.229158878326416,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0007279868004843593,
+      "learning_rate": 1e-05,
+      "loss": 0.0081,
+      "num_tokens": 23044019.0,
+      "reward": 0.1796875,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.1796875,
+      "rewards/accuracy_reward/std": 0.3854354918003082,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998399019241333,
+      "sampling/importance_sampling_ratio/min": 1.414701245039396e-07,
+      "sampling/sampling_logp_difference/max": 15.771177291870117,
+      "sampling/sampling_logp_difference/mean": 0.020945575088262558,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.277465526072774e-05,
+      "clip_ratio/high_mean": 3.193663815181935e-06,
+      "clip_ratio/low_mean": 3.348547249970579e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.667913586014038e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14729.0,
+      "completions/max_terminated_length": 14729.0,
+      "completions/mean_length": 5070.1484375,
+      "completions/mean_terminated_length": 5070.1484375,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "entropy": 1.0323031097650528,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022168844006955624,
+      "learning_rate": 1e-05,
+      "loss": 0.0657,
+      "num_tokens": 23714878.0,
+      "reward": 0.3515625,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999499917030334,
+      "sampling/importance_sampling_ratio/min": 0.0037885017227381468,
+      "sampling/sampling_logp_difference/max": 5.575784683227539,
+      "sampling/sampling_logp_difference/mean": 0.01919984258711338,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 1.2069132026226725e-05,
+      "clip_ratio/high_mean": 3.0172830065566814e-06,
+      "clip_ratio/low_mean": 3.323697501400602e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6254257338441676e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15334.0,
+      "completions/mean_length": 4792.2578125,
+      "completions/mean_terminated_length": 4700.984375,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 0.9981634542346001,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001841123914346099,
+      "learning_rate": 1e-05,
+      "loss": 0.0577,
+      "num_tokens": 24347119.0,
+      "reward": 0.4375,
+      "reward_std": 0.3524719774723053,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999489784240723,
+      "sampling/importance_sampling_ratio/min": 4.2607393879734445e-06,
+      "sampling/sampling_logp_difference/max": 12.366067886352539,
+      "sampling/sampling_logp_difference/mean": 0.018039174377918243,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.3947896150057204e-05,
+      "clip_ratio/high_mean": 4.6235029458330246e-06,
+      "clip_ratio/low_mean": 4.1055162455450045e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5678665628656745e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16032.0,
+      "completions/mean_length": 6841.375,
+      "completions/mean_terminated_length": 6453.46337890625,
+      "completions/min_length": 652.0,
+      "completions/min_terminated_length": 652.0,
+      "entropy": 1.0972845032811165,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00202017929404974,
+      "learning_rate": 1e-05,
+      "loss": -0.0092,
+      "num_tokens": 25241911.0,
+      "reward": 0.25,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999304413795471,
+      "sampling/importance_sampling_ratio/min": 0.00026355183217674494,
+      "sampling/sampling_logp_difference/max": 8.241260528564453,
+      "sampling/sampling_logp_difference/mean": 0.02115095779299736,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 4.14414989791112e-06,
+      "clip_ratio/high_mean": 1.03603747447778e-06,
+      "clip_ratio/low_mean": 4.4157833031022165e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.519387027812627e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16218.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 5645.6640625,
+      "completions/mean_terminated_length": 5645.6640625,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.0653726011514664,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003633195301517844,
+      "learning_rate": 1e-05,
+      "loss": -0.0409,
+      "num_tokens": 25982588.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999817967414856,
+      "sampling/importance_sampling_ratio/min": 0.0007106869597919285,
+      "sampling/sampling_logp_difference/max": 7.249278545379639,
+      "sampling/sampling_logp_difference/mean": 0.02010509930551052,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 7.0509927354578394e-06,
+      "clip_ratio/high_mean": 1.7627481838644599e-06,
+      "clip_ratio/low_mean": 3.606558789215342e-05,
+      "clip_ratio/low_min": 3.3240260108868824e-06,
+      "clip_ratio/region_mean": 3.782833596233104e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 7335.1875,
+      "completions/mean_terminated_length": 7118.01611328125,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "entropy": 0.9340982511639595,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017444937257096171,
+      "learning_rate": 1e-05,
+      "loss": 0.0434,
+      "num_tokens": 26946156.0,
+      "reward": 0.171875,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.171875,
+      "rewards/accuracy_reward/std": 0.3787541687488556,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998713731765747,
+      "sampling/importance_sampling_ratio/min": 2.5868248485494405e-05,
+      "sampling/sampling_logp_difference/max": 10.562494277954102,
+      "sampling/sampling_logp_difference/mean": 0.01965884119272232,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 1.1849869679281255e-05,
+      "clip_ratio/high_mean": 2.962467419820314e-06,
+      "clip_ratio/low_mean": 2.5232500775018707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8194967853778508e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14151.0,
+      "completions/mean_length": 5998.8671875,
+      "completions/mean_terminated_length": 5917.09423828125,
+      "completions/min_length": 752.0,
+      "completions/min_terminated_length": 752.0,
+      "entropy": 0.975816160440445,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020293404813855886,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 27733059.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2908889353275299,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999889612197876,
+      "sampling/importance_sampling_ratio/min": 0.00892679300159216,
+      "sampling/sampling_logp_difference/max": 4.718698024749756,
+      "sampling/sampling_logp_difference/mean": 0.01972467266023159,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.05586318315909e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.05586318315909e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15984.0,
+      "completions/max_terminated_length": 15984.0,
+      "completions/mean_length": 5599.4375,
+      "completions/mean_terminated_length": 5599.4375,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "entropy": 1.006210096180439,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035929102450609207,
+      "learning_rate": 1e-05,
+      "loss": 0.02,
+      "num_tokens": 28468843.0,
+      "reward": 0.2578125,
+      "reward_std": 0.3306073546409607,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805688858032,
+      "sampling/importance_sampling_ratio/min": 0.009500927291810513,
+      "sampling/sampling_logp_difference/max": 4.656365871429443,
+      "sampling/sampling_logp_difference/mean": 0.019885972142219543,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 1.1638733667496126e-05,
+      "clip_ratio/high_mean": 2.9096834168740315e-06,
+      "clip_ratio/low_mean": 3.210125066743785e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5010934084311884e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14843.0,
+      "completions/max_terminated_length": 14843.0,
+      "completions/mean_length": 5035.7734375,
+      "completions/mean_terminated_length": 5035.7734375,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 1.004905492067337,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023170222993940115,
+      "learning_rate": 1e-05,
+      "loss": 0.043,
+      "num_tokens": 29133270.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3037971258163452,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 4.264977542334236e-05,
+      "sampling/sampling_logp_difference/max": 10.062488555908203,
+      "sampling/sampling_logp_difference/mean": 0.019529584795236588,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.932905413734261e-06,
+      "clip_ratio/high_mean": 2.4832263534335652e-06,
+      "clip_ratio/low_mean": 4.655256179830758e-05,
+      "clip_ratio/low_min": 1.288991325054667e-05,
+      "clip_ratio/region_mean": 4.903578792436747e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 4865.6171875,
+      "completions/mean_terminated_length": 4774.92138671875,
+      "completions/min_length": 687.0,
+      "completions/min_terminated_length": 687.0,
+      "entropy": 0.9472262933850288,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0024069426581263542,
+      "learning_rate": 1e-05,
+      "loss": 0.0435,
+      "num_tokens": 29774973.0,
+      "reward": 0.4296875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000076293945312,
+      "sampling/importance_sampling_ratio/min": 4.94040648391092e-07,
+      "sampling/sampling_logp_difference/max": 14.520648002624512,
+      "sampling/sampling_logp_difference/mean": 0.017961984500288963,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.4300524526333902e-05,
+      "clip_ratio/high_mean": 4.549106392914837e-06,
+      "clip_ratio/low_mean": 8.310655789500743e-05,
+      "clip_ratio/low_min": 3.895901500072796e-06,
+      "clip_ratio/region_mean": 8.765566417423543e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14604.0,
+      "completions/max_terminated_length": 14604.0,
+      "completions/mean_length": 5928.3828125,
+      "completions/mean_terminated_length": 5928.3828125,
+      "completions/min_length": 443.0,
+      "completions/min_terminated_length": 443.0,
+      "entropy": 0.9451013877987862,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019357368582859635,
+      "learning_rate": 1e-05,
+      "loss": 0.0659,
+      "num_tokens": 30557014.0,
+      "reward": 0.2734375,
+      "reward_std": 0.3227117359638214,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000040054321289,
+      "sampling/importance_sampling_ratio/min": 4.787445504916832e-06,
+      "sampling/sampling_logp_difference/max": 12.249513626098633,
+      "sampling/sampling_logp_difference/mean": 0.020681140944361687,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.6088630218291655e-05,
+      "clip_ratio/high_mean": 4.022157554572914e-06,
+      "clip_ratio/low_mean": 4.4498895476863254e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.852105257668882e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15935.0,
+      "completions/max_terminated_length": 15935.0,
+      "completions/mean_length": 5253.890625,
+      "completions/mean_terminated_length": 5253.890625,
+      "completions/min_length": 27.0,
+      "completions/min_terminated_length": 27.0,
+      "entropy": 1.0573822036385536,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027430339250713587,
+      "learning_rate": 1e-05,
+      "loss": -0.0295,
+      "num_tokens": 31252752.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3564237058162689,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 0.0019366396591067314,
+      "sampling/sampling_logp_difference/max": 6.246800899505615,
+      "sampling/sampling_logp_difference/mean": 0.019426241517066956,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.80760021066817e-05,
+      "clip_ratio/high_mean": 4.519000526670425e-06,
+      "clip_ratio/low_mean": 2.491120585546014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9430206382130564e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12610.0,
+      "completions/mean_length": 4434.7890625,
+      "completions/mean_terminated_length": 4340.70068359375,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "entropy": 1.0309192687273026,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027177443262189627,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 31839885.0,
+      "reward": 0.359375,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999918520450592,
+      "sampling/importance_sampling_ratio/min": 0.0010315371910110116,
+      "sampling/sampling_logp_difference/max": 6.876705169677734,
+      "sampling/sampling_logp_difference/mean": 0.01883832737803459,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9404036808955425e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404036808955425e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14982.0,
+      "completions/mean_length": 6810.578125,
+      "completions/mean_terminated_length": 6735.19677734375,
+      "completions/min_length": 1260.0,
+      "completions/min_terminated_length": 1260.0,
+      "entropy": 1.134837955236435,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025250029284507036,
+      "learning_rate": 1e-05,
+      "loss": -0.0016,
+      "num_tokens": 32734551.0,
+      "reward": 0.2421875,
+      "reward_std": 0.21436068415641785,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000019073486328,
+      "sampling/importance_sampling_ratio/min": 0.0014875066699460149,
+      "sampling/sampling_logp_difference/max": 6.510653972625732,
+      "sampling/sampling_logp_difference/mean": 0.02130994386970997,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 1.1104832083219662e-05,
+      "clip_ratio/high_mean": 2.7762080208049156e-06,
+      "clip_ratio/low_mean": 2.9984376055836037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.276058407664095e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16351.0,
+      "completions/mean_length": 6623.3359375,
+      "completions/mean_terminated_length": 6308.4755859375,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "entropy": 0.990560457110405,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018365891883149743,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 33600498.0,
+      "reward": 0.3203125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 5.727278562517313e-07,
+      "sampling/sampling_logp_difference/max": 14.372855186462402,
+      "sampling/sampling_logp_difference/mean": 0.019745903089642525,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 1.5849275314394617e-05,
+      "clip_ratio/high_mean": 3.962318828598654e-06,
+      "clip_ratio/low_mean": 2.2989276772023004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.695159548693482e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14530.0,
+      "completions/mean_length": 5414.046875,
+      "completions/mean_terminated_length": 5239.9208984375,
+      "completions/min_length": 534.0,
+      "completions/min_terminated_length": 534.0,
+      "entropy": 1.213307112455368,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016015933360904455,
+      "learning_rate": 1e-05,
+      "loss": 0.0239,
+      "num_tokens": 34322776.0,
+      "reward": 0.2109375,
+      "reward_std": 0.2369818240404129,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999943733215332,
+      "sampling/importance_sampling_ratio/min": 0.0006993028800934553,
+      "sampling/sampling_logp_difference/max": 7.2654266357421875,
+      "sampling/sampling_logp_difference/mean": 0.021634424105286598,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 3.0635404527856736e-05,
+      "clip_ratio/high_mean": 7.658851131964184e-06,
+      "clip_ratio/low_mean": 4.565159474623215e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3310446219256846e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16066.0,
+      "completions/max_terminated_length": 16066.0,
+      "completions/mean_length": 6082.1015625,
+      "completions/mean_terminated_length": 6082.1015625,
+      "completions/min_length": 475.0,
+      "completions/min_terminated_length": 475.0,
+      "entropy": 0.8880708515644073,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002024279674515128,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 35118853.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3619031310081482,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 3.121717236354016e-05,
+      "sampling/sampling_logp_difference/max": 10.374542236328125,
+      "sampling/sampling_logp_difference/mean": 0.01861739531159401,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.718443036224926e-05,
+      "clip_ratio/high_mean": 4.296107590562315e-06,
+      "clip_ratio/low_mean": 3.4419200915181136e-05,
+      "clip_ratio/low_min": 3.7744964629382594e-06,
+      "clip_ratio/region_mean": 3.871530816468294e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16287.0,
+      "completions/mean_length": 6382.3828125,
+      "completions/mean_terminated_length": 6059.75,
+      "completions/min_length": 670.0,
+      "completions/min_terminated_length": 670.0,
+      "entropy": 0.8597949668765068,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002679568249732256,
+      "learning_rate": 1e-05,
+      "loss": 0.0749,
+      "num_tokens": 35956350.0,
+      "reward": 0.46875,
+      "reward_std": 0.39530590176582336,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000333786010742,
+      "sampling/importance_sampling_ratio/min": 0.0005964707233943045,
+      "sampling/sampling_logp_difference/max": 7.424480438232422,
+      "sampling/sampling_logp_difference/mean": 0.01830567792057991,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 7.470714990631677e-06,
+      "clip_ratio/high_mean": 1.8676787476579193e-06,
+      "clip_ratio/low_mean": 2.8441645326893195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0309323619803763e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16314.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 6112.7890625,
+      "completions/mean_terminated_length": 6112.7890625,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.9591199606657028,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0011262348853051662,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 36756171.0,
+      "reward": 0.359375,
+      "reward_std": 0.2743412256240845,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999510049819946,
+      "sampling/importance_sampling_ratio/min": 1.2219889867992606e-05,
+      "sampling/sampling_logp_difference/max": 11.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01950032450258732,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 3.7807756143592997e-06,
+      "clip_ratio/high_mean": 9.451939035898249e-07,
+      "clip_ratio/low_mean": 3.906526939090327e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.001046335133651e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16169.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 6744.390625,
+      "completions/mean_terminated_length": 6744.390625,
+      "completions/min_length": 719.0,
+      "completions/min_terminated_length": 719.0,
+      "entropy": 1.061469852924347,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002305408474057913,
+      "learning_rate": 1e-05,
+      "loss": 0.0496,
+      "num_tokens": 37643573.0,
+      "reward": 0.234375,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986124992371,
+      "sampling/importance_sampling_ratio/min": 9.516369573248085e-06,
+      "sampling/sampling_logp_difference/max": 11.56249713897705,
+      "sampling/sampling_logp_difference/mean": 0.020016517490148544,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.3845812645740807e-05,
+      "clip_ratio/high_mean": 3.4614531614352018e-06,
+      "clip_ratio/low_mean": 2.3906941066798026e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7368394228233228e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15606.0,
+      "completions/max_terminated_length": 15606.0,
+      "completions/mean_length": 5723.0859375,
+      "completions/mean_terminated_length": 5723.0859375,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "entropy": 1.0918374806642532,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002079444006085396,
+      "learning_rate": 1e-05,
+      "loss": 0.0332,
+      "num_tokens": 38399000.0,
+      "reward": 0.34375,
+      "reward_std": 0.28353503346443176,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999094009399414,
+      "sampling/importance_sampling_ratio/min": 0.00247886567376554,
+      "sampling/sampling_logp_difference/max": 5.9999542236328125,
+      "sampling/sampling_logp_difference/mean": 0.02025545760989189,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.6330426660715602e-05,
+      "clip_ratio/high_mean": 4.082606665178901e-06,
+      "clip_ratio/low_mean": 4.608668984928954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0169297423963144e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15124.0,
+      "completions/mean_length": 6075.078125,
+      "completions/mean_terminated_length": 5827.6640625,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "entropy": 1.0526456609368324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002548371907323599,
+      "learning_rate": 1e-05,
+      "loss": 0.0005,
+      "num_tokens": 39195762.0,
+      "reward": 0.28125,
+      "reward_std": 0.2903746962547302,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 0.0003802210558205843,
+      "sampling/sampling_logp_difference/max": 7.874757766723633,
+      "sampling/sampling_logp_difference/mean": 0.02132822386920452,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 1.2557530681078788e-05,
+      "clip_ratio/high_mean": 3.139382670269697e-06,
+      "clip_ratio/low_mean": 5.579355536156072e-05,
+      "clip_ratio/low_min": 6.314919346550596e-06,
+      "clip_ratio/region_mean": 5.893293734970939e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14981.0,
+      "completions/mean_length": 6273.203125,
+      "completions/mean_terminated_length": 6193.59033203125,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 0.9629805982112885,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001929077785462141,
+      "learning_rate": 1e-05,
+      "loss": 0.0575,
+      "num_tokens": 40016988.0,
+      "reward": 0.3828125,
+      "reward_std": 0.35718512535095215,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000178813934326,
+      "sampling/importance_sampling_ratio/min": 0.004126251209527254,
+      "sampling/sampling_logp_difference/max": 5.490386009216309,
+      "sampling/sampling_logp_difference/mean": 0.01974763534963131,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 5.326855898601934e-06,
+      "clip_ratio/high_mean": 1.3317139746504836e-06,
+      "clip_ratio/low_mean": 1.2195182989671594e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.3526897078008915e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12113.0,
+      "completions/mean_length": 4658.1640625,
+      "completions/mean_terminated_length": 4565.83447265625,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "entropy": 0.950105108320713,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002910251496359706,
+      "learning_rate": 1e-05,
+      "loss": 0.0068,
+      "num_tokens": 40632681.0,
+      "reward": 0.390625,
+      "reward_std": 0.28353503346443176,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000574588775635,
+      "sampling/importance_sampling_ratio/min": 0.0017036369536072016,
+      "sampling/sampling_logp_difference/max": 6.374989986419678,
+      "sampling/sampling_logp_difference/mean": 0.018849056214094162,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 1.1988173810095759e-05,
+      "clip_ratio/high_mean": 2.9970434525239398e-06,
+      "clip_ratio/low_mean": 2.1473538311056473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4470581195146224e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15638.0,
+      "completions/mean_length": 6582.953125,
+      "completions/mean_terminated_length": 5756.94921875,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.8884479179978371,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018201791681349277,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 41498939.0,
+      "reward": 0.328125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000053644180298,
+      "sampling/importance_sampling_ratio/min": 0.00011687594087561592,
+      "sampling/sampling_logp_difference/max": 9.054397583007812,
+      "sampling/sampling_logp_difference/mean": 0.018637457862496376,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9767679873439192e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9767679873439192e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15902.0,
+      "completions/mean_length": 6408.4453125,
+      "completions/mean_terminated_length": 6250.103515625,
+      "completions/min_length": 360.0,
+      "completions/min_terminated_length": 360.0,
+      "entropy": 1.0724121406674385,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027558596339076757,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 42338436.0,
+      "reward": 0.2578125,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000157356262207,
+      "sampling/importance_sampling_ratio/min": 2.144563404726796e-05,
+      "sampling/sampling_logp_difference/max": 10.74998950958252,
+      "sampling/sampling_logp_difference/mean": 0.020520739257335663,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.615732708160067e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.615732708160067e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 4527.8984375,
+      "completions/mean_terminated_length": 4243.35205078125,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 0.9734272584319115,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018782512051984668,
+      "learning_rate": 1e-05,
+      "loss": 0.0726,
+      "num_tokens": 42936215.0,
+      "reward": 0.4375,
+      "reward_std": 0.2890765368938446,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626874923706,
+      "sampling/importance_sampling_ratio/min": 6.564679324583267e-07,
+      "sampling/sampling_logp_difference/max": 14.2363920211792,
+      "sampling/sampling_logp_difference/mean": 0.018541917204856873,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 1.9634914679045323e-05,
+      "clip_ratio/high_mean": 4.908728669761331e-06,
+      "clip_ratio/low_mean": 3.605886263358116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.096759084859514e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14340.0,
+      "completions/max_terminated_length": 14340.0,
+      "completions/mean_length": 5389.609375,
+      "completions/mean_terminated_length": 5389.609375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.035320296883583,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003410179866477847,
+      "learning_rate": 1e-05,
+      "loss": 0.1109,
+      "num_tokens": 43643733.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3040394186973572,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999303221702576,
+      "sampling/importance_sampling_ratio/min": 7.063792872941121e-05,
+      "sampling/sampling_logp_difference/max": 9.557943344116211,
+      "sampling/sampling_logp_difference/mean": 0.01980186253786087,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 3.324525869174977e-05,
+      "clip_ratio/high_mean": 9.664479989623942e-06,
+      "clip_ratio/low_mean": 3.5182122701371554e-05,
+      "clip_ratio/low_min": 1.1718383575498592e-05,
+      "clip_ratio/region_mean": 4.484660291836917e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 5338.90625,
+      "completions/mean_terminated_length": 5251.93701171875,
+      "completions/min_length": 630.0,
+      "completions/min_terminated_length": 630.0,
+      "entropy": 0.9680418893694878,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0013158825458958745,
+      "learning_rate": 1e-05,
+      "loss": 0.0851,
+      "num_tokens": 44345177.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999946117401123,
+      "sampling/importance_sampling_ratio/min": 1.941789093962143e-07,
+      "sampling/sampling_logp_difference/max": 15.454485893249512,
+      "sampling/sampling_logp_difference/mean": 0.019034607335925102,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 1.678188709774986e-05,
+      "clip_ratio/high_mean": 4.195471774437465e-06,
+      "clip_ratio/low_mean": 2.326147910025611e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.74569506473199e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15030.0,
+      "completions/mean_length": 5197.5859375,
+      "completions/mean_terminated_length": 5020.02392578125,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 0.9385635256767273,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023898824583739042,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 45029716.0,
+      "reward": 0.328125,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999666213989258,
+      "sampling/importance_sampling_ratio/min": 0.0031843625474721193,
+      "sampling/sampling_logp_difference/max": 5.749503135681152,
+      "sampling/sampling_logp_difference/mean": 0.017856482416391373,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 2.8269179438211722e-05,
+      "clip_ratio/high_mean": 7.0672948595529306e-06,
+      "clip_ratio/low_mean": 4.551043662104348e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2577731821656926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6353.9375,
+      "completions/mean_terminated_length": 6194.73046875,
+      "completions/min_length": 1201.0,
+      "completions/min_terminated_length": 1201.0,
+      "entropy": 0.9195960611104965,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002777763642370701,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "num_tokens": 45861388.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999842643737793,
+      "sampling/importance_sampling_ratio/min": 0.00033647287636995316,
+      "sampling/sampling_logp_difference/max": 7.996993064880371,
+      "sampling/sampling_logp_difference/mean": 0.019472671672701836,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 8.376483492611442e-06,
+      "clip_ratio/high_mean": 2.0941208731528604e-06,
+      "clip_ratio/low_mean": 1.1372792755537375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.3466913628690236e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 7125.265625,
+      "completions/mean_terminated_length": 6669.91748046875,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 0.9209358915686607,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012601700145751238,
+      "learning_rate": 1e-05,
+      "loss": 0.0263,
+      "num_tokens": 46793902.0,
+      "reward": 0.265625,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999377727508545,
+      "sampling/importance_sampling_ratio/min": 2.034899989666883e-05,
+      "sampling/sampling_logp_difference/max": 10.802478790283203,
+      "sampling/sampling_logp_difference/mean": 0.0191169623285532,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 6.630596089962637e-06,
+      "clip_ratio/high_mean": 1.6576490224906593e-06,
+      "clip_ratio/low_mean": 3.7912880316071096e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.957052945224859e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14620.0,
+      "completions/mean_length": 5895.4453125,
+      "completions/mean_terminated_length": 5812.8583984375,
+      "completions/min_length": 708.0,
+      "completions/min_terminated_length": 708.0,
+      "entropy": 0.9421789273619652,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036641336046159267,
+      "learning_rate": 1e-05,
+      "loss": 0.0572,
+      "num_tokens": 47567543.0,
+      "reward": 0.359375,
+      "reward_std": 0.2937847673892975,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999691247940063,
+      "sampling/importance_sampling_ratio/min": 2.1912494048592634e-05,
+      "sampling/sampling_logp_difference/max": 10.728453636169434,
+      "sampling/sampling_logp_difference/mean": 0.018009435385465622,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.6876661106834945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6876661106834945e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13935.0,
+      "completions/mean_length": 4643.9921875,
+      "completions/mean_terminated_length": 4551.55126953125,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "entropy": 1.1234809532761574,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003017786890268326,
+      "learning_rate": 1e-05,
+      "loss": 0.0403,
+      "num_tokens": 48180998.0,
+      "reward": 0.328125,
+      "reward_std": 0.2198973000049591,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999090433120728,
+      "sampling/importance_sampling_ratio/min": 1.4786172641834128e-06,
+      "sampling/sampling_logp_difference/max": 13.424403190612793,
+      "sampling/sampling_logp_difference/mean": 0.0194530226290226,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 1.1807285773102194e-05,
+      "clip_ratio/high_mean": 2.9518214432755485e-06,
+      "clip_ratio/low_mean": 1.7793156246170838e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0744977689446387e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16243.0,
+      "completions/mean_length": 7134.5546875,
+      "completions/mean_terminated_length": 6679.66357421875,
+      "completions/min_length": 765.0,
+      "completions/min_terminated_length": 765.0,
+      "entropy": 1.0891609117388725,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021707366686314344,
+      "learning_rate": 1e-05,
+      "loss": 0.0079,
+      "num_tokens": 49113837.0,
+      "reward": 0.2578125,
+      "reward_std": 0.21778056025505066,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000441074371338,
+      "sampling/importance_sampling_ratio/min": 5.227705059951404e-06,
+      "sampling/sampling_logp_difference/max": 12.161538124084473,
+      "sampling/sampling_logp_difference/mean": 0.021074742078781128,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 1.785590688996308e-05,
+      "clip_ratio/high_mean": 4.46397672249077e-06,
+      "clip_ratio/low_mean": 4.4942946374249004e-05,
+      "clip_ratio/low_min": 4.320774223742774e-06,
+      "clip_ratio/region_mean": 4.940692338095687e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16178.0,
+      "completions/mean_length": 6770.3984375,
+      "completions/mean_terminated_length": 6694.70068359375,
+      "completions/min_length": 488.0,
+      "completions/min_terminated_length": 488.0,
+      "entropy": 1.14402187615633,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003913953434675932,
+      "learning_rate": 1e-05,
+      "loss": -0.0645,
+      "num_tokens": 49999984.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999771118164062,
+      "sampling/importance_sampling_ratio/min": 0.00039836866199038923,
+      "sampling/sampling_logp_difference/max": 7.828132629394531,
+      "sampling/sampling_logp_difference/mean": 0.021658796817064285,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 6.990269412199268e-06,
+      "clip_ratio/high_mean": 3.4296645026188344e-06,
+      "clip_ratio/low_mean": 3.069889220341793e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.412855670603676e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 6743.3359375,
+      "completions/mean_terminated_length": 5926.33056640625,
+      "completions/min_length": 1195.0,
+      "completions/min_terminated_length": 1195.0,
+      "entropy": 0.8485476225614548,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0015872148796916008,
+      "learning_rate": 1e-05,
+      "loss": 0.0107,
+      "num_tokens": 50881939.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998895525932312,
+      "sampling/importance_sampling_ratio/min": 0.008705966174602509,
+      "sampling/sampling_logp_difference/max": 4.743746757507324,
+      "sampling/sampling_logp_difference/mean": 0.017901426181197166,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.300406438531354e-05,
+      "clip_ratio/high_mean": 3.251016096328385e-06,
+      "clip_ratio/low_mean": 3.055216484426637e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.380318116796843e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15756.0,
+      "completions/max_terminated_length": 15756.0,
+      "completions/mean_length": 5952.0234375,
+      "completions/mean_terminated_length": 5952.0234375,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 1.1280141845345497,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037659234367311,
+      "learning_rate": 1e-05,
+      "loss": 0.1156,
+      "num_tokens": 51664814.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009536743164,
+      "sampling/importance_sampling_ratio/min": 0.0037554434966295958,
+      "sampling/sampling_logp_difference/max": 5.5845489501953125,
+      "sampling/sampling_logp_difference/mean": 0.01998155191540718,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 9.465616585657699e-06,
+      "clip_ratio/high_mean": 2.3664041464144248e-06,
+      "clip_ratio/low_mean": 3.98842666982091e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2250670958310366e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15301.0,
+      "completions/mean_length": 5533.171875,
+      "completions/mean_terminated_length": 5360.93701171875,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "entropy": 0.9313871935009956,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003427086630836129,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 52391076.0,
+      "reward": 0.421875,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445080757141,
+      "sampling/importance_sampling_ratio/min": 2.0617162590497173e-05,
+      "sampling/sampling_logp_difference/max": 10.789386749267578,
+      "sampling/sampling_logp_difference/mean": 0.019165968522429466,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.4208102129487088e-05,
+      "clip_ratio/high_mean": 3.552025532371772e-06,
+      "clip_ratio/low_mean": 3.275496806054434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630699370660295e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16334.0,
+      "completions/mean_length": 7481.671875,
+      "completions/mean_terminated_length": 7194.5,
+      "completions/min_length": 1003.0,
+      "completions/min_terminated_length": 1003.0,
+      "entropy": 0.9429318532347679,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002845548093318939,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 53366314.0,
+      "reward": 0.34375,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999762773513794,
+      "sampling/importance_sampling_ratio/min": 0.00124227290507406,
+      "sampling/sampling_logp_difference/max": 6.690812587738037,
+      "sampling/sampling_logp_difference/mean": 0.019388489425182343,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 2.2517269826494157e-05,
+      "clip_ratio/high_mean": 5.629317456623539e-06,
+      "clip_ratio/low_mean": 6.0563696024473757e-05,
+      "clip_ratio/low_min": 6.892558758408995e-06,
+      "clip_ratio/region_mean": 6.61930134810973e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16230.0,
+      "completions/mean_length": 6112.03125,
+      "completions/mean_terminated_length": 5865.50439453125,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.9013729467988014,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017839284846559167,
+      "learning_rate": 1e-05,
+      "loss": 0.0758,
+      "num_tokens": 54165910.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.0015448861522600055,
+      "sampling/sampling_logp_difference/max": 6.472805023193359,
+      "sampling/sampling_logp_difference/mean": 0.019030068069696426,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 7.458678737748414e-06,
+      "clip_ratio/high_mean": 1.8646696844371036e-06,
+      "clip_ratio/low_mean": 2.7964613764197566e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.982928344863467e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15853.0,
+      "completions/max_terminated_length": 15853.0,
+      "completions/mean_length": 4590.625,
+      "completions/mean_terminated_length": 4590.625,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 0.8759121596813202,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035294899716973305,
+      "learning_rate": 1e-05,
+      "loss": 0.0802,
+      "num_tokens": 54771526.0,
+      "reward": 0.4375,
+      "reward_std": 0.41268986463546753,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999133944511414,
+      "sampling/importance_sampling_ratio/min": 0.0007238102261908352,
+      "sampling/sampling_logp_difference/max": 7.230981349945068,
+      "sampling/sampling_logp_difference/mean": 0.017765047028660774,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.460266958019929e-05,
+      "clip_ratio/high_mean": 3.6506673950498225e-06,
+      "clip_ratio/low_mean": 3.319967777315469e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.685034562295186e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 5152.234375,
+      "completions/mean_terminated_length": 5063.79541015625,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "entropy": 0.8593896478414536,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003597866278141737,
+      "learning_rate": 1e-05,
+      "loss": 0.048,
+      "num_tokens": 55449820.0,
+      "reward": 0.4453125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999961853027344,
+      "sampling/importance_sampling_ratio/min": 0.0005548940971493721,
+      "sampling/sampling_logp_difference/max": 7.49673318862915,
+      "sampling/sampling_logp_difference/mean": 0.018061507493257523,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.4012571227794979e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.4012571227794979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16195.0,
+      "completions/mean_length": 6629.2734375,
+      "completions/mean_terminated_length": 6474.43701171875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.106893703341484,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0014848506543785334,
+      "learning_rate": 1e-05,
+      "loss": -0.0128,
+      "num_tokens": 56318135.0,
+      "reward": 0.2109375,
+      "reward_std": 0.190433531999588,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999266862869263,
+      "sampling/importance_sampling_ratio/min": 1.3627897033074987e-08,
+      "sampling/sampling_logp_difference/max": 18.111146926879883,
+      "sampling/sampling_logp_difference/mean": 0.021642908453941345,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.388627917251142e-05,
+      "clip_ratio/low_min": 5.944737495156005e-06,
+      "clip_ratio/region_mean": 4.388627917251142e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14845.0,
+      "completions/max_terminated_length": 14845.0,
+      "completions/mean_length": 5802.8828125,
+      "completions/mean_terminated_length": 5802.8828125,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9879340082406998,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003300516167655587,
+      "learning_rate": 1e-05,
+      "loss": 0.0321,
+      "num_tokens": 57078080.0,
+      "reward": 0.3125,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000663995742798,
+      "sampling/importance_sampling_ratio/min": 0.0010333232348784804,
+      "sampling/sampling_logp_difference/max": 6.874975204467773,
+      "sampling/sampling_logp_difference/mean": 0.01895206607878208,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.071953920472879e-05,
+      "clip_ratio/high_mean": 2.6798848011821974e-06,
+      "clip_ratio/low_mean": 4.836337473079766e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.104325930460618e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14713.0,
+      "completions/max_terminated_length": 14713.0,
+      "completions/mean_length": 5293.1640625,
+      "completions/mean_terminated_length": 5293.1640625,
+      "completions/min_length": 344.0,
+      "completions/min_terminated_length": 344.0,
+      "entropy": 0.9724989607930183,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002898244420066476,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 57774093.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0031829492654651403,
+      "sampling/sampling_logp_difference/max": 5.7499470710754395,
+      "sampling/sampling_logp_difference/mean": 0.019694382324814796,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.102629304725269e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.102629304725269e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13987.0,
+      "completions/mean_length": 5771.5625,
+      "completions/mean_terminated_length": 5340.16259765625,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "entropy": 0.9740649163722992,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002261349931359291,
+      "learning_rate": 1e-05,
+      "loss": 0.0738,
+      "num_tokens": 58531293.0,
+      "reward": 0.25,
+      "reward_std": 0.26120057702064514,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999984502792358,
+      "sampling/importance_sampling_ratio/min": 7.037367322482169e-05,
+      "sampling/sampling_logp_difference/max": 9.561691284179688,
+      "sampling/sampling_logp_difference/mean": 0.019619958475232124,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 1.241475092683686e-05,
+      "clip_ratio/high_mean": 3.955232841690304e-06,
+      "clip_ratio/low_mean": 3.313706986318721e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.709230361437221e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16331.0,
+      "completions/mean_length": 6832.59375,
+      "completions/mean_terminated_length": 6524.48388671875,
+      "completions/min_length": 674.0,
+      "completions/min_terminated_length": 674.0,
+      "entropy": 0.8907959461212158,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002895365934818983,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 59425137.0,
+      "reward": 0.4296875,
+      "reward_std": 0.36797165870666504,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000522136688232,
+      "sampling/importance_sampling_ratio/min": 0.000623974425252527,
+      "sampling/sampling_logp_difference/max": 7.379401206970215,
+      "sampling/sampling_logp_difference/mean": 0.019336842000484467,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 1.309858976128453e-05,
+      "clip_ratio/high_mean": 3.2746474403211323e-06,
+      "clip_ratio/low_mean": 3.091655224807255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.419119957470684e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15537.0,
+      "completions/mean_length": 5741.3515625,
+      "completions/mean_terminated_length": 5572.4208984375,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.9363748207688332,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003053537104278803,
+      "learning_rate": 1e-05,
+      "loss": 0.0503,
+      "num_tokens": 60177006.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999263882637024,
+      "sampling/importance_sampling_ratio/min": 0.0009319739765487611,
+      "sampling/sampling_logp_difference/max": 6.978205680847168,
+      "sampling/sampling_logp_difference/mean": 0.01948600634932518,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 2.1969835415802663e-05,
+      "clip_ratio/high_mean": 7.355770890171698e-06,
+      "clip_ratio/low_mean": 3.6011779457112425e-05,
+      "clip_ratio/low_min": 4.118887773074675e-06,
+      "clip_ratio/region_mean": 4.336755046097096e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15546.0,
+      "completions/mean_length": 6333.078125,
+      "completions/mean_terminated_length": 6091.8564453125,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 0.8286701366305351,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001936097047291696,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 61007192.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3135277032852173,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999134540557861,
+      "sampling/importance_sampling_ratio/min": 0.00018122897017747164,
+      "sampling/sampling_logp_difference/max": 8.61574935913086,
+      "sampling/sampling_logp_difference/mean": 0.017766552045941353,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 3.815369746007491e-05,
+      "clip_ratio/high_mean": 1.1110751302112476e-05,
+      "clip_ratio/low_mean": 5.337692005014105e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.448767180700088e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14444.0,
+      "completions/mean_length": 4467.71875,
+      "completions/mean_terminated_length": 4373.8896484375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0210246965289116,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00340029364451766,
+      "learning_rate": 1e-05,
+      "loss": -0.0143,
+      "num_tokens": 61606900.0,
+      "reward": 0.359375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999921441078186,
+      "sampling/importance_sampling_ratio/min": 0.004546399228274822,
+      "sampling/sampling_logp_difference/max": 5.3934197425842285,
+      "sampling/sampling_logp_difference/mean": 0.019704686477780342,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 1.4954135622247122e-05,
+      "clip_ratio/high_mean": 3.7385339055617806e-06,
+      "clip_ratio/low_mean": 3.632040886714094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0058942545329046e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15231.0,
+      "completions/mean_length": 5543.71875,
+      "completions/mean_terminated_length": 5283.55224609375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 0.9587382078170776,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016365943010896444,
+      "learning_rate": 1e-05,
+      "loss": 0.0057,
+      "num_tokens": 62335440.0,
+      "reward": 0.2421875,
+      "reward_std": 0.2964382767677307,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000062346458435,
+      "sampling/importance_sampling_ratio/min": 1.835696679108878e-07,
+      "sampling/sampling_logp_difference/max": 15.510671615600586,
+      "sampling/sampling_logp_difference/mean": 0.019060850143432617,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1255708386670449e-05,
+      "clip_ratio/high_mean": 2.813927096667612e-06,
+      "clip_ratio/low_mean": 1.205687783567555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.4870804704969487e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15514.0,
+      "completions/max_terminated_length": 15514.0,
+      "completions/mean_length": 5553.65625,
+      "completions/mean_terminated_length": 5553.65625,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "entropy": 1.0059658586978912,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028732717037200928,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 63071644.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3098035454750061,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000003457069397,
+      "sampling/importance_sampling_ratio/min": 0.0030927264597266912,
+      "sampling/sampling_logp_difference/max": 5.778702259063721,
+      "sampling/sampling_logp_difference/mean": 0.01885710284113884,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.669913806130353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.669913806130353e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15925.0,
+      "completions/mean_length": 5576.2265625,
+      "completions/mean_terminated_length": 5491.1259765625,
+      "completions/min_length": 62.0,
+      "completions/min_terminated_length": 62.0,
+      "entropy": 0.9912052825093269,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003957705572247505,
+      "learning_rate": 1e-05,
+      "loss": 0.0033,
+      "num_tokens": 63804529.0,
+      "reward": 0.2265625,
+      "reward_std": 0.23751860857009888,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998383522033691,
+      "sampling/importance_sampling_ratio/min": 0.0004883196670562029,
+      "sampling/sampling_logp_difference/max": 7.624540328979492,
+      "sampling/sampling_logp_difference/mean": 0.019657567143440247,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 7.340359388763318e-06,
+      "clip_ratio/high_mean": 1.8350898471908295e-06,
+      "clip_ratio/low_mean": 4.2495241643791815e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4330331377295806e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6716.9375,
+      "completions/mean_terminated_length": 6484.92822265625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.974421925842762,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027452034410089254,
+      "learning_rate": 1e-05,
+      "loss": -0.0238,
+      "num_tokens": 64684825.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998891949653625,
+      "sampling/importance_sampling_ratio/min": 0.00023439532378688455,
+      "sampling/sampling_logp_difference/max": 8.358501434326172,
+      "sampling/sampling_logp_difference/mean": 0.020278966054320335,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 1.1668500064843101e-05,
+      "clip_ratio/high_mean": 2.9171250162107754e-06,
+      "clip_ratio/low_mean": 2.278766351082595e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5704788185976213e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16172.0,
+      "completions/mean_length": 6033.609375,
+      "completions/mean_terminated_length": 5869.31787109375,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "entropy": 0.9376208484172821,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014822481898590922,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 65476055.0,
+      "reward": 0.28125,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999359846115112,
+      "sampling/importance_sampling_ratio/min": 0.0031867078505456448,
+      "sampling/sampling_logp_difference/max": 5.748766899108887,
+      "sampling/sampling_logp_difference/mean": 0.0203948225826025,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.2838053666873748e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2838053666873748e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15593.0,
+      "completions/mean_length": 6561.4453125,
+      "completions/mean_terminated_length": 6405.5322265625,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.8753902241587639,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016284709563478827,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 66335528.0,
+      "reward": 0.3125,
+      "reward_std": 0.28535234928131104,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999115467071533,
+      "sampling/importance_sampling_ratio/min": 7.897153409430757e-06,
+      "sampling/sampling_logp_difference/max": 11.749008178710938,
+      "sampling/sampling_logp_difference/mean": 0.01995038241147995,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.7495306085256743e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7495306085256743e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12974.0,
+      "completions/mean_length": 5322.03125,
+      "completions/mean_terminated_length": 5234.92919921875,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 0.9731436967849731,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004579639527946711,
+      "learning_rate": 1e-05,
+      "loss": 0.0111,
+      "num_tokens": 67036244.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2714630365371704,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000991821289062,
+      "sampling/importance_sampling_ratio/min": 0.00016946837422437966,
+      "sampling/sampling_logp_difference/max": 8.682844161987305,
+      "sampling/sampling_logp_difference/mean": 0.017986822873353958,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 9.390067589265527e-06,
+      "clip_ratio/high_mean": 2.347516897316382e-06,
+      "clip_ratio/low_mean": 2.9141255822651146e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.148877271996753e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 5428.1484375,
+      "completions/mean_terminated_length": 5254.24609375,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "entropy": 0.9560057744383812,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030614053830504417,
+      "learning_rate": 1e-05,
+      "loss": 0.0677,
+      "num_tokens": 67751911.0,
+      "reward": 0.40625,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998039603233337,
+      "sampling/importance_sampling_ratio/min": 0.00041119891102425754,
+      "sampling/sampling_logp_difference/max": 7.796433448791504,
+      "sampling/sampling_logp_difference/mean": 0.019884781911969185,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.3370414308155887e-05,
+      "clip_ratio/high_mean": 3.3426035770389717e-06,
+      "clip_ratio/low_mean": 2.5133818439826427e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.84764220168654e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16098.0,
+      "completions/mean_length": 6381.9140625,
+      "completions/mean_terminated_length": 6303.1572265625,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "entropy": 1.0577945485711098,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018679362256079912,
+      "learning_rate": 1e-05,
+      "loss": 0.0464,
+      "num_tokens": 68594620.0,
+      "reward": 0.1875,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000123977661133,
+      "sampling/importance_sampling_ratio/min": 7.031799759715796e-05,
+      "sampling/sampling_logp_difference/max": 9.562482833862305,
+      "sampling/sampling_logp_difference/mean": 0.019965168088674545,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 5.103707280795788e-06,
+      "clip_ratio/high_mean": 1.275926820198947e-06,
+      "clip_ratio/low_mean": 4.938993617997767e-05,
+      "clip_ratio/low_min": 4.324361725593917e-06,
+      "clip_ratio/region_mean": 5.06658626591161e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14491.0,
+      "completions/mean_length": 5626.5703125,
+      "completions/mean_terminated_length": 5455.81787109375,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 0.8880954682826996,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003913378342986107,
+      "learning_rate": 1e-05,
+      "loss": 0.078,
+      "num_tokens": 69335061.0,
+      "reward": 0.359375,
+      "reward_std": 0.4066115617752075,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001311302185,
+      "sampling/importance_sampling_ratio/min": 0.00010254964581690729,
+      "sampling/sampling_logp_difference/max": 9.185163497924805,
+      "sampling/sampling_logp_difference/mean": 0.018766846507787704,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 2.656613628460036e-05,
+      "clip_ratio/high_mean": 6.64153407115009e-06,
+      "clip_ratio/low_mean": 5.355309394872165e-05,
+      "clip_ratio/low_min": 6.923673481651349e-06,
+      "clip_ratio/region_mean": 6.019462853146251e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6252.5078125,
+      "completions/mean_terminated_length": 6172.732421875,
+      "completions/min_length": 583.0,
+      "completions/min_terminated_length": 583.0,
+      "entropy": 1.0409839749336243,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002942018210887909,
+      "learning_rate": 1e-05,
+      "loss": 0.0286,
+      "num_tokens": 70158806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30221226811408997,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998798370361328,
+      "sampling/importance_sampling_ratio/min": 0.00027446431340649724,
+      "sampling/sampling_logp_difference/max": 8.200689315795898,
+      "sampling/sampling_logp_difference/mean": 0.02092035487294197,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 1.0007204764406197e-05,
+      "clip_ratio/high_mean": 2.501801191101549e-06,
+      "clip_ratio/low_mean": 6.03029346848416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.280473587594315e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15687.0,
+      "completions/mean_length": 5936.171875,
+      "completions/mean_terminated_length": 5770.33349609375,
+      "completions/min_length": 614.0,
+      "completions/min_terminated_length": 614.0,
+      "entropy": 0.9782606735825539,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018363922135904431,
+      "learning_rate": 1e-05,
+      "loss": 0.0037,
+      "num_tokens": 70938108.0,
+      "reward": 0.296875,
+      "reward_std": 0.31824085116386414,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999080300331116,
+      "sampling/importance_sampling_ratio/min": 0.0001234232186106965,
+      "sampling/sampling_logp_difference/max": 8.99989128112793,
+      "sampling/sampling_logp_difference/mean": 0.02028634399175644,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 2.2271185798672377e-05,
+      "clip_ratio/high_mean": 5.567796449668094e-06,
+      "clip_ratio/low_mean": 2.026856623160711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.583636239705811e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15826.0,
+      "completions/mean_length": 5796.34375,
+      "completions/mean_terminated_length": 5712.9765625,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "entropy": 0.9343783929944038,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036520177964121103,
+      "learning_rate": 1e-05,
+      "loss": 0.0465,
+      "num_tokens": 71697904.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000145435333252,
+      "sampling/importance_sampling_ratio/min": 0.0013267943868413568,
+      "sampling/sampling_logp_difference/max": 6.6249895095825195,
+      "sampling/sampling_logp_difference/mean": 0.01939292624592781,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.3236602853794466e-05,
+      "clip_ratio/high_mean": 5.30995015424196e-06,
+      "clip_ratio/low_mean": 2.4116298618537257e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.942624860224896e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16075.0,
+      "completions/mean_length": 5912.5078125,
+      "completions/mean_terminated_length": 5746.2939453125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8880549967288971,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002131880959495902,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 72472657.0,
+      "reward": 0.484375,
+      "reward_std": 0.3027363121509552,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998900890350342,
+      "sampling/importance_sampling_ratio/min": 1.3350321736993465e-08,
+      "sampling/sampling_logp_difference/max": 18.131725311279297,
+      "sampling/sampling_logp_difference/mean": 0.019045043736696243,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 1.0632415978761856e-05,
+      "clip_ratio/high_mean": 2.658103994690464e-06,
+      "clip_ratio/low_mean": 3.596552733142744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.862363143980474e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14486.0,
+      "completions/mean_length": 5471.203125,
+      "completions/mean_terminated_length": 5385.275390625,
+      "completions/min_length": 757.0,
+      "completions/min_terminated_length": 757.0,
+      "entropy": 0.9127756953239441,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.0030769745353609324,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 73191403.0,
+      "reward": 0.5234375,
+      "reward_std": 0.4281895160675049,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999668598175049,
+      "sampling/importance_sampling_ratio/min": 1.3584097757757263e-07,
+      "sampling/sampling_logp_difference/max": 15.81178092956543,
+      "sampling/sampling_logp_difference/mean": 0.019179491326212883,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 6.134668183221947e-06,
+      "clip_ratio/high_mean": 1.5336670458054869e-06,
+      "clip_ratio/low_mean": 2.465653636818388e-05,
+      "clip_ratio/low_min": 3.4443801268935204e-06,
+      "clip_ratio/region_mean": 2.6190203413989366e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6366.5078125,
+      "completions/mean_terminated_length": 6207.50048828125,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 0.9889310300350189,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027727377600967884,
+      "learning_rate": 1e-05,
+      "loss": 0.011,
+      "num_tokens": 74026484.0,
+      "reward": 0.328125,
+      "reward_std": 0.3174794614315033,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998502731323242,
+      "sampling/importance_sampling_ratio/min": 0.00011932474444620311,
+      "sampling/sampling_logp_difference/max": 9.033661842346191,
+      "sampling/sampling_logp_difference/mean": 0.01946873590350151,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 1.3569412203651154e-05,
+      "clip_ratio/high_mean": 3.3923530509127886e-06,
+      "clip_ratio/low_mean": 2.118610348134098e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4578456645940605e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16312.0,
+      "completions/max_terminated_length": 16312.0,
+      "completions/mean_length": 4089.6015625,
+      "completions/mean_terminated_length": 4089.6015625,
+      "completions/min_length": 566.0,
+      "completions/min_terminated_length": 566.0,
+      "entropy": 0.8083604946732521,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003628374310210347,
+      "learning_rate": 1e-05,
+      "loss": -0.002,
+      "num_tokens": 74567833.0,
+      "reward": 0.484375,
+      "reward_std": 0.3174794614315033,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999944269657135,
+      "sampling/importance_sampling_ratio/min": 0.000612107920460403,
+      "sampling/sampling_logp_difference/max": 7.39860200881958,
+      "sampling/sampling_logp_difference/mean": 0.017995744943618774,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.947620376085979e-05,
+      "clip_ratio/high_mean": 5.989323312860506e-06,
+      "clip_ratio/low_mean": 2.8597964728760417e-05,
+      "clip_ratio/low_min": 7.570710295112804e-06,
+      "clip_ratio/region_mean": 3.458728804162092e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16340.0,
+      "completions/mean_length": 5678.7890625,
+      "completions/mean_terminated_length": 5508.865234375,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "entropy": 0.880424402654171,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004177773837000132,
+      "learning_rate": 1e-05,
+      "loss": 0.0595,
+      "num_tokens": 75314022.0,
+      "reward": 0.4765625,
+      "reward_std": 0.4105730950832367,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999686479568481,
+      "sampling/importance_sampling_ratio/min": 3.343528805999085e-05,
+      "sampling/sampling_logp_difference/max": 10.305898666381836,
+      "sampling/sampling_logp_difference/mean": 0.018467536196112633,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 1.4969179119361797e-05,
+      "clip_ratio/high_mean": 3.7422947798404493e-06,
+      "clip_ratio/low_mean": 5.1001184147025924e-05,
+      "clip_ratio/low_min": 7.801042556820903e-06,
+      "clip_ratio/region_mean": 5.474347858580586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5253.0234375,
+      "completions/mean_terminated_length": 5253.0234375,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "entropy": 0.9227524027228355,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015437579713761806,
+      "learning_rate": 1e-05,
+      "loss": 0.0445,
+      "num_tokens": 76005417.0,
+      "reward": 0.3515625,
+      "reward_std": 0.34586966037750244,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999125003814697,
+      "sampling/importance_sampling_ratio/min": 5.159151623956859e-05,
+      "sampling/sampling_logp_difference/max": 9.872153282165527,
+      "sampling/sampling_logp_difference/mean": 0.018250152468681335,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.3062932339380495e-05,
+      "clip_ratio/high_mean": 3.265733084845124e-06,
+      "clip_ratio/low_mean": 3.931676133106521e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2582495325405034e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15289.0,
+      "completions/mean_length": 5956.921875,
+      "completions/mean_terminated_length": 5533.056640625,
+      "completions/min_length": 606.0,
+      "completions/min_terminated_length": 606.0,
+      "entropy": 0.892315685749054,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019212538609281182,
+      "learning_rate": 1e-05,
+      "loss": 0.0688,
+      "num_tokens": 76787623.0,
+      "reward": 0.34375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999054074287415,
+      "sampling/importance_sampling_ratio/min": 0.0012463966850191355,
+      "sampling/sampling_logp_difference/max": 6.687498569488525,
+      "sampling/sampling_logp_difference/mean": 0.018439805135130882,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 2.714365291467402e-05,
+      "clip_ratio/high_mean": 6.785913228668505e-06,
+      "clip_ratio/low_mean": 3.920890912922914e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5994822471584484e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14715.0,
+      "completions/mean_length": 5575.09375,
+      "completions/mean_terminated_length": 5315.68017578125,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "entropy": 1.0225786119699478,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029739944729954004,
+      "learning_rate": 1e-05,
+      "loss": 0.0482,
+      "num_tokens": 77520091.0,
+      "reward": 0.3203125,
+      "reward_std": 0.29719969630241394,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 1.9004226032848237e-06,
+      "sampling/sampling_logp_difference/max": 13.173434257507324,
+      "sampling/sampling_logp_difference/mean": 0.020432481542229652,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.1180974752278416e-05,
+      "clip_ratio/high_mean": 2.795243688069604e-06,
+      "clip_ratio/low_mean": 5.534062506740156e-05,
+      "clip_ratio/low_min": 4.409326720633544e-06,
+      "clip_ratio/region_mean": 5.813586813019356e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16346.0,
+      "completions/mean_length": 7777.171875,
+      "completions/mean_terminated_length": 7499.5322265625,
+      "completions/min_length": 724.0,
+      "completions/min_terminated_length": 724.0,
+      "entropy": 0.8798429742455482,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021529686637222767,
+      "learning_rate": 1e-05,
+      "loss": 0.0963,
+      "num_tokens": 78538993.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3164186477661133,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998733401298523,
+      "sampling/importance_sampling_ratio/min": 2.081840648315847e-06,
+      "sampling/sampling_logp_difference/max": 13.082258224487305,
+      "sampling/sampling_logp_difference/mean": 0.019486568868160248,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 1.4091711364017101e-05,
+      "clip_ratio/high_mean": 3.5229278410042753e-06,
+      "clip_ratio/low_mean": 4.0216968045569956e-05,
+      "clip_ratio/low_min": 4.320475454733241e-06,
+      "clip_ratio/region_mean": 4.3739896682382096e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15763.0,
+      "completions/mean_length": 6298.4296875,
+      "completions/mean_terminated_length": 6219.015625,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 1.0422330349683762,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002485725563019514,
+      "learning_rate": 1e-05,
+      "loss": 0.0674,
+      "num_tokens": 79365144.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999951124191284,
+      "sampling/importance_sampling_ratio/min": 0.0008047395385801792,
+      "sampling/sampling_logp_difference/max": 7.1249918937683105,
+      "sampling/sampling_logp_difference/mean": 0.021251153200864792,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 5.182851054996718e-06,
+      "clip_ratio/high_mean": 1.2957127637491794e-06,
+      "clip_ratio/low_mean": 1.3408006566351105e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.4703719102726609e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13199.0,
+      "completions/max_terminated_length": 13199.0,
+      "completions/mean_length": 5001.8515625,
+      "completions/mean_terminated_length": 5001.8515625,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9210668653249741,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018336179200559855,
+      "learning_rate": 1e-05,
+      "loss": -0.0075,
+      "num_tokens": 80024661.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2969672679901123,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0015512153040617704,
+      "sampling/sampling_logp_difference/max": 6.468716621398926,
+      "sampling/sampling_logp_difference/mean": 0.018811997026205063,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 3.179798750352347e-05,
+      "clip_ratio/high_mean": 7.949496875880868e-06,
+      "clip_ratio/low_mean": 2.5010467197716935e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.29599640735978e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15916.0,
+      "completions/mean_length": 6280.1875,
+      "completions/mean_terminated_length": 6119.81005859375,
+      "completions/min_length": 426.0,
+      "completions/min_terminated_length": 426.0,
+      "entropy": 1.0198880061507225,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00276190135627985,
+      "learning_rate": 1e-05,
+      "loss": 0.0474,
+      "num_tokens": 80845941.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.00043450010707601905,
+      "sampling/sampling_logp_difference/max": 7.74131441116333,
+      "sampling/sampling_logp_difference/mean": 0.020783018320798874,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.0263617241434986e-05,
+      "clip_ratio/high_mean": 2.5659043103587464e-06,
+      "clip_ratio/low_mean": 2.2780154608881276e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.534605857817951e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14812.0,
+      "completions/mean_length": 5617.109375,
+      "completions/mean_terminated_length": 5358.7041015625,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0532233864068985,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020079545211046934,
+      "learning_rate": 1e-05,
+      "loss": 0.03,
+      "num_tokens": 81584099.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3037971258163452,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000622272491455,
+      "sampling/importance_sampling_ratio/min": 0.0014304202049970627,
+      "sampling/sampling_logp_difference/max": 6.5497870445251465,
+      "sampling/sampling_logp_difference/mean": 0.019330721348524094,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 3.592160510379472e-06,
+      "clip_ratio/high_mean": 8.98040127594868e-07,
+      "clip_ratio/low_mean": 2.2189478841028176e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3087518968623044e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15901.0,
+      "completions/mean_length": 4336.828125,
+      "completions/mean_terminated_length": 4241.96826171875,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.8131270706653595,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002346212510019541,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 82157581.0,
+      "reward": 0.59375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998981952667236,
+      "sampling/importance_sampling_ratio/min": 0.011126067489385605,
+      "sampling/sampling_logp_difference/max": 4.498464584350586,
+      "sampling/sampling_logp_difference/mean": 0.01748315989971161,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.621310563379666e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.621310563379666e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15911.0,
+      "completions/mean_length": 6185.1640625,
+      "completions/mean_terminated_length": 6023.2783203125,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "entropy": 0.9515878483653069,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020737929735332727,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 82970866.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229640007019,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999544024467468,
+      "sampling/importance_sampling_ratio/min": 0.00021864472364541143,
+      "sampling/sampling_logp_difference/max": 8.428062438964844,
+      "sampling/sampling_logp_difference/mean": 0.019794369116425514,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 2.830697485478595e-05,
+      "clip_ratio/high_mean": 7.076743713696487e-06,
+      "clip_ratio/low_mean": 3.404362587389187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1120369132841006e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15649.0,
+      "completions/mean_length": 6042.359375,
+      "completions/mean_terminated_length": 5960.92919921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9405315592885017,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013609385350719094,
+      "learning_rate": 1e-05,
+      "loss": 0.0023,
+      "num_tokens": 83762664.0,
+      "reward": 0.265625,
+      "reward_std": 0.2937847375869751,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000874996185303,
+      "sampling/importance_sampling_ratio/min": 0.03007127158343792,
+      "sampling/sampling_logp_difference/max": 3.5041849613189697,
+      "sampling/sampling_logp_difference/mean": 0.02063683047890663,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 2.4490228042850504e-05,
+      "clip_ratio/high_mean": 7.702277343923924e-06,
+      "clip_ratio/low_mean": 4.2714329822501895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.04166071095824e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16288.0,
+      "completions/mean_length": 7036.859375,
+      "completions/mean_terminated_length": 6963.259765625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9034569710493088,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017795560415834188,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 84684566.0,
+      "reward": 0.359375,
+      "reward_std": 0.2977414131164551,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000296831130981,
+      "sampling/importance_sampling_ratio/min": 0.03753140941262245,
+      "sampling/sampling_logp_difference/max": 3.2825770378112793,
+      "sampling/sampling_logp_difference/mean": 0.019494226202368736,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 2.028518520091893e-05,
+      "clip_ratio/high_mean": 6.102377255956526e-06,
+      "clip_ratio/low_mean": 3.518054700180073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.128292380300991e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16308.0,
+      "completions/mean_length": 6958.6484375,
+      "completions/mean_terminated_length": 6413.3798828125,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "entropy": 0.9195531085133553,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027138369623571634,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 85598345.0,
+      "reward": 0.421875,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999241828918457,
+      "sampling/importance_sampling_ratio/min": 0.0004585298302117735,
+      "sampling/sampling_logp_difference/max": 7.687485218048096,
+      "sampling/sampling_logp_difference/mean": 0.0201261006295681,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 7.460459528374486e-06,
+      "clip_ratio/high_mean": 3.464071141934255e-06,
+      "clip_ratio/low_mean": 3.825124849754502e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.171532009422663e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5773.890625,
+      "completions/mean_terminated_length": 5773.890625,
+      "completions/min_length": 792.0,
+      "completions/min_terminated_length": 792.0,
+      "entropy": 0.8253094777464867,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019655083306133747,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 86356403.0,
+      "reward": 0.390625,
+      "reward_std": 0.2635546922683716,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999909520149231,
+      "sampling/importance_sampling_ratio/min": 2.981063744300627e-06,
+      "sampling/sampling_logp_difference/max": 12.723230361938477,
+      "sampling/sampling_logp_difference/mean": 0.018150178715586662,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 7.937012014735956e-06,
+      "clip_ratio/high_mean": 1.984253003683989e-06,
+      "clip_ratio/low_mean": 4.778610400535399e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9770356781664304e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15584.0,
+      "completions/mean_length": 5233.546875,
+      "completions/mean_terminated_length": 4873.8544921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8463557213544846,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0024442693684250116,
+      "learning_rate": 1e-05,
+      "loss": 0.1172,
+      "num_tokens": 87043681.0,
+      "reward": 0.375,
+      "reward_std": 0.3987257480621338,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999265670776367,
+      "sampling/importance_sampling_ratio/min": 4.3303893448864983e-07,
+      "sampling/sampling_logp_difference/max": 14.652438163757324,
+      "sampling/sampling_logp_difference/mean": 0.01760055497288704,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 2.0049358681717422e-05,
+      "clip_ratio/high_mean": 6.392639988916926e-06,
+      "clip_ratio/low_mean": 2.7909350819754764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4301990581298014e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16070.0,
+      "completions/mean_length": 6098.5234375,
+      "completions/mean_terminated_length": 5851.67236328125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 0.9961429908871651,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001763843116350472,
+      "learning_rate": 1e-05,
+      "loss": 0.0279,
+      "num_tokens": 87845012.0,
+      "reward": 0.3125,
+      "reward_std": 0.24329747259616852,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999946117401123,
+      "sampling/importance_sampling_ratio/min": 0.0012967984657734632,
+      "sampling/sampling_logp_difference/max": 6.647856712341309,
+      "sampling/sampling_logp_difference/mean": 0.020430129021406174,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 6.041565939085558e-06,
+      "clip_ratio/high_mean": 1.5103914847713895e-06,
+      "clip_ratio/low_mean": 3.8537290720341844e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.004768220511323e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15364.0,
+      "completions/mean_length": 7306.828125,
+      "completions/mean_terminated_length": 6937.8369140625,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 1.0500907376408577,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023989977780729532,
+      "learning_rate": 1e-05,
+      "loss": 0.0383,
+      "num_tokens": 88799758.0,
+      "reward": 0.1875,
+      "reward_std": 0.23752352595329285,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998784065246582,
+      "sampling/importance_sampling_ratio/min": 0.00016530237917322665,
+      "sampling/sampling_logp_difference/max": 8.707734107971191,
+      "sampling/sampling_logp_difference/mean": 0.021274670958518982,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.1037226335683954e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1037226335683954e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15693.0,
+      "completions/mean_length": 5156.9765625,
+      "completions/mean_terminated_length": 4978.77001953125,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "entropy": 1.0691863298416138,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032527034636586905,
+      "learning_rate": 1e-05,
+      "loss": 0.1168,
+      "num_tokens": 89482459.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3406246304512024,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999943375587463,
+      "sampling/importance_sampling_ratio/min": 0.00010107864363817498,
+      "sampling/sampling_logp_difference/max": 9.19961166381836,
+      "sampling/sampling_logp_difference/mean": 0.019853606820106506,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 2.2721950699633453e-05,
+      "clip_ratio/high_mean": 5.680487674908363e-06,
+      "clip_ratio/low_mean": 4.0971160615299596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6651648517581634e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6804.8125,
+      "completions/mean_terminated_length": 6495.80615234375,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 0.867309644818306,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019014904974028468,
+      "learning_rate": 1e-05,
+      "loss": 0.0593,
+      "num_tokens": 90372587.0,
+      "reward": 0.375,
+      "reward_std": 0.34139877557754517,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999526143074036,
+      "sampling/importance_sampling_ratio/min": 0.00012341687397565693,
+      "sampling/sampling_logp_difference/max": 8.999942779541016,
+      "sampling/sampling_logp_difference/mean": 0.018908457830548286,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 1.0602929251035675e-05,
+      "clip_ratio/high_mean": 2.650732312758919e-06,
+      "clip_ratio/low_mean": 4.483750217332272e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.748823448608164e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15419.0,
+      "completions/max_terminated_length": 15419.0,
+      "completions/mean_length": 5354.2890625,
+      "completions/mean_terminated_length": 5354.2890625,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9092740416526794,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028308529872447252,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 91080912.0,
+      "reward": 0.3359375,
+      "reward_std": 0.34245961904525757,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000000238418579,
+      "sampling/importance_sampling_ratio/min": 0.003619713708758354,
+      "sampling/sampling_logp_difference/max": 5.6213603019714355,
+      "sampling/sampling_logp_difference/mean": 0.018408317118883133,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 7.076040446918341e-06,
+      "clip_ratio/high_mean": 1.7690101117295853e-06,
+      "clip_ratio/low_mean": 6.420628960768227e-05,
+      "clip_ratio/low_min": 9.37260915634397e-06,
+      "clip_ratio/region_mean": 6.59752995488816e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7653.1328125,
+      "completions/mean_terminated_length": 7371.49169921875,
+      "completions/min_length": 344.0,
+      "completions/min_terminated_length": 344.0,
+      "entropy": 0.9067098647356033,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026082738768309355,
+      "learning_rate": 1e-05,
+      "loss": 0.0373,
+      "num_tokens": 92080441.0,
+      "reward": 0.3125,
+      "reward_std": 0.3395638167858124,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999957084655762,
+      "sampling/importance_sampling_ratio/min": 3.7638976209564134e-05,
+      "sampling/sampling_logp_difference/max": 10.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.019849080592393875,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.642525709641632e-06,
+      "clip_ratio/high_mean": 1.8333832940697903e-06,
+      "clip_ratio/low_mean": 4.188668265214801e-05,
+      "clip_ratio/low_min": 6.032381861587055e-06,
+      "clip_ratio/region_mean": 4.3720065264096775e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 7864.796875,
+      "completions/mean_terminated_length": 7220.48779296875,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 1.0423363894224167,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001708728028461337,
+      "learning_rate": 1e-05,
+      "loss": 0.0394,
+      "num_tokens": 93107607.0,
+      "reward": 0.2265625,
+      "reward_std": 0.23933593928813934,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999992311000824,
+      "sampling/importance_sampling_ratio/min": 4.743846602650592e-06,
+      "sampling/sampling_logp_difference/max": 12.258662223815918,
+      "sampling/sampling_logp_difference/mean": 0.02070365846157074,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 6.424297680496238e-06,
+      "clip_ratio/high_mean": 1.6060744201240595e-06,
+      "clip_ratio/low_mean": 4.487338674152852e-05,
+      "clip_ratio/low_min": 7.803849257470574e-06,
+      "clip_ratio/region_mean": 4.647946116165258e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7690.6328125,
+      "completions/mean_terminated_length": 7622.18115234375,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 1.061365969479084,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026824623346328735,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 94111296.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2556639611721039,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998635649681091,
+      "sampling/importance_sampling_ratio/min": 0.00014029098383616656,
+      "sampling/sampling_logp_difference/max": 8.87179183959961,
+      "sampling/sampling_logp_difference/mean": 0.021192047744989395,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 5.478851562656928e-06,
+      "clip_ratio/high_mean": 1.369712890664232e-06,
+      "clip_ratio/low_mean": 1.5870192100919667e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.72399049915839e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15693.0,
+      "completions/mean_length": 5871.2265625,
+      "completions/mean_terminated_length": 5618.92041015625,
+      "completions/min_length": 126.0,
+      "completions/min_terminated_length": 126.0,
+      "entropy": 1.0346312001347542,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0012895551044493914,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 94883061.0,
+      "reward": 0.3125,
+      "reward_std": 0.16675156354904175,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999569654464722,
+      "sampling/importance_sampling_ratio/min": 0.007269685622304678,
+      "sampling/sampling_logp_difference/max": 4.924042224884033,
+      "sampling/sampling_logp_difference/mean": 0.02043779566884041,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 9.75199873209931e-06,
+      "clip_ratio/high_mean": 3.4236486499139573e-06,
+      "clip_ratio/low_mean": 3.807359871643712e-05,
+      "clip_ratio/low_min": 6.6283109845244326e-06,
+      "clip_ratio/region_mean": 4.1497247366351075e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15671.0,
+      "completions/mean_length": 7205.0703125,
+      "completions/mean_terminated_length": 6908.9755859375,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "entropy": 0.8426484614610672,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024157650768756866,
+      "learning_rate": 1e-05,
+      "loss": 0.0334,
+      "num_tokens": 95831798.0,
+      "reward": 0.3671875,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999579191207886,
+      "sampling/importance_sampling_ratio/min": 0.00780851487070322,
+      "sampling/sampling_logp_difference/max": 4.852540493011475,
+      "sampling/sampling_logp_difference/mean": 0.01930900476872921,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 7.827117542547057e-06,
+      "clip_ratio/high_mean": 1.9567793856367643e-06,
+      "clip_ratio/low_mean": 2.85506193904439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0507398662393825e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15557.0,
+      "completions/mean_length": 6770.2578125,
+      "completions/mean_terminated_length": 6539.5283203125,
+      "completions/min_length": 715.0,
+      "completions/min_terminated_length": 715.0,
+      "entropy": 0.8648517951369286,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018663652008399367,
+      "learning_rate": 1e-05,
+      "loss": 0.0353,
+      "num_tokens": 96716079.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3135277330875397,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147057533264,
+      "sampling/importance_sampling_ratio/min": 0.0013688995968550444,
+      "sampling/sampling_logp_difference/max": 6.593748092651367,
+      "sampling/sampling_logp_difference/mean": 0.019091933965682983,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 8.396982593694702e-06,
+      "clip_ratio/high_mean": 2.0992456484236754e-06,
+      "clip_ratio/low_mean": 3.30035152273922e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5102760875815875e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16138.0,
+      "completions/mean_length": 7880.8359375,
+      "completions/mean_terminated_length": 7745.86572265625,
+      "completions/min_length": 832.0,
+      "completions/min_terminated_length": 832.0,
+      "entropy": 0.9396157637238503,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016418134327977896,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 97744506.0,
+      "reward": 0.2109375,
+      "reward_std": 0.22225633263587952,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507069587708,
+      "sampling/importance_sampling_ratio/min": 0.0072977589443326,
+      "sampling/sampling_logp_difference/max": 4.920187950134277,
+      "sampling/sampling_logp_difference/mean": 0.02041018195450306,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.872459816671835e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.872459816671835e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16362.0,
+      "completions/mean_length": 6425.3515625,
+      "completions/mean_terminated_length": 6267.2783203125,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "entropy": 0.9397681280970573,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002677743323147297,
+      "learning_rate": 1e-05,
+      "loss": 0.0076,
+      "num_tokens": 98587647.0,
+      "reward": 0.359375,
+      "reward_std": 0.2567248046398163,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 5.40250198355352e-07,
+      "sampling/sampling_logp_difference/max": 14.431233406066895,
+      "sampling/sampling_logp_difference/mean": 0.020279735326766968,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.306506624132453e-05,
+      "clip_ratio/high_mean": 3.2662665603311325e-06,
+      "clip_ratio/low_mean": 3.8350387626451266e-05,
+      "clip_ratio/low_min": 9.45358260651119e-06,
+      "clip_ratio/region_mean": 4.161665401625214e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 7129.4609375,
+      "completions/mean_terminated_length": 6907.3525390625,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 1.1336064785718918,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032464349642395973,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 99522458.0,
+      "reward": 0.3046875,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999245405197144,
+      "sampling/importance_sampling_ratio/min": 0.0046671414747834206,
+      "sampling/sampling_logp_difference/max": 5.367208480834961,
+      "sampling/sampling_logp_difference/mean": 0.021748989820480347,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 9.463296464673476e-06,
+      "clip_ratio/high_mean": 2.365824116168369e-06,
+      "clip_ratio/low_mean": 3.497452934198009e-05,
+      "clip_ratio/low_min": 6.806807050452335e-06,
+      "clip_ratio/region_mean": 3.734035340130504e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15500.0,
+      "completions/mean_length": 7264.7421875,
+      "completions/mean_terminated_length": 7119.99267578125,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.8998278677463531,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026182979345321655,
+      "learning_rate": 1e-05,
+      "loss": 0.1161,
+      "num_tokens": 100474137.0,
+      "reward": 0.46875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000280141830444,
+      "sampling/importance_sampling_ratio/min": 0.021124430000782013,
+      "sampling/sampling_logp_difference/max": 3.8573250770568848,
+      "sampling/sampling_logp_difference/mean": 0.019057951867580414,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 8.944165074353805e-06,
+      "clip_ratio/high_mean": 2.236041268588451e-06,
+      "clip_ratio/low_mean": 4.6521246076736134e-05,
+      "clip_ratio/low_min": 7.112780167517485e-06,
+      "clip_ratio/region_mean": 4.875728745901142e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15064.0,
+      "completions/mean_length": 5473.71875,
+      "completions/mean_terminated_length": 5387.81103515625,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.9666230976581573,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0020499166566878557,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 101191861.0,
+      "reward": 0.328125,
+      "reward_std": 0.345874547958374,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999291896820068,
+      "sampling/importance_sampling_ratio/min": 1.8367816210229648e-06,
+      "sampling/sampling_logp_difference/max": 13.20749568939209,
+      "sampling/sampling_logp_difference/mean": 0.019896289333701134,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 2.054391302408476e-05,
+      "clip_ratio/high_mean": 5.13597825602119e-06,
+      "clip_ratio/low_mean": 6.0949954104216886e-05,
+      "clip_ratio/low_min": 1.2865434428022127e-05,
+      "clip_ratio/region_mean": 6.608593298551568e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 6679.9765625,
+      "completions/mean_terminated_length": 5946.05908203125,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.8775574564933777,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0024929519277065992,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 102070058.0,
+      "reward": 0.3671875,
+      "reward_std": 0.41398313641548157,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998810291290283,
+      "sampling/importance_sampling_ratio/min": 0.004311627708375454,
+      "sampling/sampling_logp_difference/max": 5.446439743041992,
+      "sampling/sampling_logp_difference/mean": 0.018816513940691948,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.7019791250259004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7019791250259004e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16345.0,
+      "completions/mean_length": 6549.0625,
+      "completions/mean_terminated_length": 6313.0244140625,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 0.8732621371746063,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002134882379323244,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 102926522.0,
+      "reward": 0.3828125,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000419616699219,
+      "sampling/importance_sampling_ratio/min": 0.0017044072737917304,
+      "sampling/sampling_logp_difference/max": 6.374537944793701,
+      "sampling/sampling_logp_difference/mean": 0.019951295107603073,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 3.6268677376938285e-06,
+      "clip_ratio/high_mean": 9.067169344234571e-07,
+      "clip_ratio/low_mean": 3.5008752547582844e-05,
+      "clip_ratio/low_min": 3.866736733471043e-06,
+      "clip_ratio/region_mean": 3.591546965253656e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16306.0,
+      "completions/mean_length": 6011.8359375,
+      "completions/mean_terminated_length": 5677.25,
+      "completions/min_length": 731.0,
+      "completions/min_terminated_length": 731.0,
+      "entropy": 0.9975898712873459,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037468743976205587,
+      "learning_rate": 1e-05,
+      "loss": 0.0818,
+      "num_tokens": 103714277.0,
+      "reward": 0.359375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000693798065186,
+      "sampling/importance_sampling_ratio/min": 0.002192396903410554,
+      "sampling/sampling_logp_difference/max": 6.122759819030762,
+      "sampling/sampling_logp_difference/mean": 0.019433926790952682,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 2.6430232992424862e-05,
+      "clip_ratio/high_mean": 6.607558248106216e-06,
+      "clip_ratio/low_mean": 3.3786116432565905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0393675021732633e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15027.0,
+      "completions/mean_length": 6270.203125,
+      "completions/mean_terminated_length": 6190.56689453125,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.7808161675930023,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035926424898207188,
+      "learning_rate": 1e-05,
+      "loss": 0.1162,
+      "num_tokens": 104537295.0,
+      "reward": 0.4921875,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999791383743286,
+      "sampling/importance_sampling_ratio/min": 0.00840076245367527,
+      "sampling/sampling_logp_difference/max": 4.779432773590088,
+      "sampling/sampling_logp_difference/mean": 0.017456334084272385,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.908255777991144e-05,
+      "clip_ratio/low_min": 7.643389835720882e-06,
+      "clip_ratio/region_mean": 4.908255777991144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14565.0,
+      "completions/mean_length": 4916.25,
+      "completions/mean_terminated_length": 4734.22265625,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.8354851230978966,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004259355366230011,
+      "learning_rate": 1e-05,
+      "loss": 0.0879,
+      "num_tokens": 105184551.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3656175136566162,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000231266021729,
+      "sampling/importance_sampling_ratio/min": 0.003178094746544957,
+      "sampling/sampling_logp_difference/max": 5.751473426818848,
+      "sampling/sampling_logp_difference/mean": 0.01745998114347458,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 6.184750873217126e-06,
+      "clip_ratio/high_mean": 2.3343936845776625e-06,
+      "clip_ratio/low_mean": 3.130356230940379e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.363795599398145e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14083.0,
+      "completions/mean_length": 5317.515625,
+      "completions/mean_terminated_length": 5230.3779296875,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.9808826446533203,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021007952746003866,
+      "learning_rate": 1e-05,
+      "loss": -0.0037,
+      "num_tokens": 105889289.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3151204586029053,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.004087196197360754,
+      "sampling/sampling_logp_difference/max": 5.499896049499512,
+      "sampling/sampling_logp_difference/mean": 0.020308660343289375,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 6.264094281505095e-06,
+      "clip_ratio/high_mean": 1.5660235703762737e-06,
+      "clip_ratio/low_mean": 4.276942695469188e-05,
+      "clip_ratio/low_min": 5.777519618277438e-06,
+      "clip_ratio/region_mean": 4.4335450525068154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16052.0,
+      "completions/mean_length": 7302.3671875,
+      "completions/mean_terminated_length": 6776.9833984375,
+      "completions/min_length": 81.0,
+      "completions/min_terminated_length": 81.0,
+      "entropy": 0.8526253402233124,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001218521734699607,
+      "learning_rate": 1e-05,
+      "loss": 0.0705,
+      "num_tokens": 106849048.0,
+      "reward": 0.28125,
+      "reward_std": 0.22331714630126953,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999129772186279,
+      "sampling/importance_sampling_ratio/min": 0.010783779434859753,
+      "sampling/sampling_logp_difference/max": 4.529712200164795,
+      "sampling/sampling_logp_difference/mean": 0.019228527322411537,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 1.1513777735672193e-05,
+      "clip_ratio/high_mean": 2.878444433918048e-06,
+      "clip_ratio/low_mean": 3.477262850992702e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7651072489097714e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14681.0,
+      "completions/mean_length": 4603.46875,
+      "completions/mean_terminated_length": 4510.70849609375,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.7025937959551811,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002826553536579013,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 107456676.0,
+      "reward": 0.625,
+      "reward_std": 0.35878273844718933,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932050704956,
+      "sampling/importance_sampling_ratio/min": 0.0006447202758863568,
+      "sampling/sampling_logp_difference/max": 7.346693992614746,
+      "sampling/sampling_logp_difference/mean": 0.016313642263412476,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 4.341936346463626e-06,
+      "clip_ratio/high_mean": 1.0854840866159066e-06,
+      "clip_ratio/low_mean": 4.9752483846532414e-05,
+      "clip_ratio/low_min": 1.0369344636274036e-05,
+      "clip_ratio/region_mean": 5.083796850158251e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16081.0,
+      "completions/mean_length": 7055.921875,
+      "completions/mean_terminated_length": 6755.01611328125,
+      "completions/min_length": 824.0,
+      "completions/min_terminated_length": 824.0,
+      "entropy": 0.8677415996789932,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015939075965434313,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 108380090.0,
+      "reward": 0.359375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999587535858154,
+      "sampling/importance_sampling_ratio/min": 0.007212483324110508,
+      "sampling/sampling_logp_difference/max": 4.931941986083984,
+      "sampling/sampling_logp_difference/mean": 0.019018646329641342,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.017062949264073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.017062949264073e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15374.0,
+      "completions/mean_length": 6947.546875,
+      "completions/mean_terminated_length": 6563.951171875,
+      "completions/min_length": 578.0,
+      "completions/min_terminated_length": 578.0,
+      "entropy": 0.9537070691585541,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014140130952000618,
+      "learning_rate": 1e-05,
+      "loss": 0.0685,
+      "num_tokens": 109288008.0,
+      "reward": 0.28125,
+      "reward_std": 0.35612428188323975,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532103538513,
+      "sampling/importance_sampling_ratio/min": 0.002557439962401986,
+      "sampling/sampling_logp_difference/max": 5.968748569488525,
+      "sampling/sampling_logp_difference/mean": 0.02024715766310692,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.4431375348067377e-05,
+      "clip_ratio/high_mean": 3.607843837016844e-06,
+      "clip_ratio/low_mean": 2.80186426380169e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.162648749821528e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16053.0,
+      "completions/mean_length": 5742.4140625,
+      "completions/mean_terminated_length": 5658.6220703125,
+      "completions/min_length": 952.0,
+      "completions/min_terminated_length": 952.0,
+      "entropy": 0.8954835087060928,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012910671066492796,
+      "learning_rate": 1e-05,
+      "loss": 0.0939,
+      "num_tokens": 110041333.0,
+      "reward": 0.4375,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 2.282886634930037e-05,
+      "sampling/sampling_logp_difference/max": 10.687484741210938,
+      "sampling/sampling_logp_difference/mean": 0.017754144966602325,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 3.2560687031946145e-05,
+      "clip_ratio/high_mean": 9.421434697287623e-06,
+      "clip_ratio/low_mean": 2.801389479145655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7435329431900755e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14702.0,
+      "completions/max_terminated_length": 14702.0,
+      "completions/mean_length": 5582.1640625,
+      "completions/mean_terminated_length": 5582.1640625,
+      "completions/min_length": 65.0,
+      "completions/min_terminated_length": 65.0,
+      "entropy": 0.9963158369064331,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002162793418392539,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 110775762.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999851584434509,
+      "sampling/importance_sampling_ratio/min": 0.0010016339365392923,
+      "sampling/sampling_logp_difference/max": 6.90612268447876,
+      "sampling/sampling_logp_difference/mean": 0.020483866333961487,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 1.746983889461262e-05,
+      "clip_ratio/high_mean": 7.333224402827909e-06,
+      "clip_ratio/low_mean": 3.6373660350363934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3706885207939195e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13832.0,
+      "completions/mean_length": 6047.8984375,
+      "completions/mean_terminated_length": 5883.83349609375,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.913147509098053,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00287337857298553,
+      "learning_rate": 1e-05,
+      "loss": 0.045,
+      "num_tokens": 111568589.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3453328609466553,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 9.964095625036862e-06,
+      "sampling/sampling_logp_difference/max": 11.516522407531738,
+      "sampling/sampling_logp_difference/mean": 0.018301380798220634,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 2.6439459361426998e-05,
+      "clip_ratio/high_mean": 6.6098648403567495e-06,
+      "clip_ratio/low_mean": 4.587054809235269e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.248041247796209e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14709.0,
+      "completions/mean_length": 6462.28125,
+      "completions/mean_terminated_length": 6224.16015625,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 1.1468544080853462,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017887315480038524,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 112414673.0,
+      "reward": 0.2734375,
+      "reward_std": 0.23592589795589447,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999889135360718,
+      "sampling/importance_sampling_ratio/min": 0.0007102306117303669,
+      "sampling/sampling_logp_difference/max": 7.249920845031738,
+      "sampling/sampling_logp_difference/mean": 0.021768372505903244,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 1.6320968370564515e-05,
+      "clip_ratio/high_mean": 5.031390969634231e-06,
+      "clip_ratio/low_mean": 3.567474152532668e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0706131812839885e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16126.0,
+      "completions/mean_length": 6897.0078125,
+      "completions/mean_terminated_length": 6822.30712890625,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 0.9793258458375931,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022704254370182753,
+      "learning_rate": 1e-05,
+      "loss": 0.0423,
+      "num_tokens": 113321722.0,
+      "reward": 0.2890625,
+      "reward_std": 0.34297874569892883,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000591278076172,
+      "sampling/importance_sampling_ratio/min": 5.476621663547121e-05,
+      "sampling/sampling_logp_difference/max": 9.812437057495117,
+      "sampling/sampling_logp_difference/mean": 0.020364979282021523,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 8.64622779772617e-06,
+      "clip_ratio/high_mean": 2.1615569494315423e-06,
+      "clip_ratio/low_mean": 4.702959677160834e-05,
+      "clip_ratio/low_min": 6.21032540948363e-06,
+      "clip_ratio/region_mean": 4.9191153607353044e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15914.0,
+      "completions/mean_length": 6779.7421875,
+      "completions/mean_terminated_length": 6307.4013671875,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9858463555574417,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022105660755187273,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 114210841.0,
+      "reward": 0.390625,
+      "reward_std": 0.3676722049713135,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 4.2232295527355745e-06,
+      "sampling/sampling_logp_difference/max": 12.374910354614258,
+      "sampling/sampling_logp_difference/mean": 0.021493885666131973,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 9.080286417884054e-06,
+      "clip_ratio/high_mean": 2.2700716044710134e-06,
+      "clip_ratio/low_mean": 3.73501702597423e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9620241750526475e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15145.0,
+      "completions/mean_length": 6204.34375,
+      "completions/mean_terminated_length": 5960.0322265625,
+      "completions/min_length": 771.0,
+      "completions/min_terminated_length": 771.0,
+      "entropy": 0.9073990881443024,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021019333507865667,
+      "learning_rate": 1e-05,
+      "loss": 0.0985,
+      "num_tokens": 115023469.0,
+      "reward": 0.4375,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999035596847534,
+      "sampling/importance_sampling_ratio/min": 7.850129009057127e-07,
+      "sampling/sampling_logp_difference/max": 14.057565689086914,
+      "sampling/sampling_logp_difference/mean": 0.019073951989412308,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 7.07747756223398e-05,
+      "clip_ratio/low_min": 6.719346401951043e-06,
+      "clip_ratio/region_mean": 7.07747756223398e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14484.0,
+      "completions/mean_length": 6382.890625,
+      "completions/mean_terminated_length": 5891.0322265625,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.8928572610020638,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002703179605305195,
+      "learning_rate": 1e-05,
+      "loss": 0.1215,
+      "num_tokens": 115860183.0,
+      "reward": 0.46875,
+      "reward_std": 0.3924228549003601,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999281764030457,
+      "sampling/importance_sampling_ratio/min": 0.002329134149476886,
+      "sampling/sampling_logp_difference/max": 6.062258720397949,
+      "sampling/sampling_logp_difference/mean": 0.018461842089891434,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.991344158293941e-05,
+      "clip_ratio/low_min": 4.287576302886009e-06,
+      "clip_ratio/region_mean": 3.991344158293941e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15934.0,
+      "completions/mean_length": 6856.25,
+      "completions/mean_terminated_length": 6387.671875,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "entropy": 0.9867237955331802,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025988349225372076,
+      "learning_rate": 1e-05,
+      "loss": 0.0191,
+      "num_tokens": 116757023.0,
+      "reward": 0.34375,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999343156814575,
+      "sampling/importance_sampling_ratio/min": 2.9312623155419715e-05,
+      "sampling/sampling_logp_difference/max": 10.437492370605469,
+      "sampling/sampling_logp_difference/mean": 0.019526638090610504,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.096957769661458e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.096957769661458e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15873.0,
+      "completions/mean_length": 6312.1328125,
+      "completions/mean_terminated_length": 5816.794921875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 0.8896873891353607,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0036364132538437843,
+      "learning_rate": 1e-05,
+      "loss": 0.0579,
+      "num_tokens": 117584064.0,
+      "reward": 0.2578125,
+      "reward_std": 0.3090519309043884,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998835325241089,
+      "sampling/importance_sampling_ratio/min": 0.0009706970304250717,
+      "sampling/sampling_logp_difference/max": 6.937496185302734,
+      "sampling/sampling_logp_difference/mean": 0.019127443432807922,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 3.0199071261449717e-06,
+      "clip_ratio/high_mean": 7.549767815362429e-07,
+      "clip_ratio/low_mean": 4.133729697741728e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.20922739863272e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16279.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 5875.625,
+      "completions/mean_terminated_length": 5875.625,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.9082999676465988,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025688125751912594,
+      "learning_rate": 1e-05,
+      "loss": 0.0737,
+      "num_tokens": 118354672.0,
+      "reward": 0.453125,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999657273292542,
+      "sampling/importance_sampling_ratio/min": 0.0024201429914683104,
+      "sampling/sampling_logp_difference/max": 6.023928642272949,
+      "sampling/sampling_logp_difference/mean": 0.019491348415613174,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 5.6563644648122136e-06,
+      "clip_ratio/high_mean": 1.4140911162030534e-06,
+      "clip_ratio/low_mean": 4.235651454109757e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.377060565730062e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13490.0,
+      "completions/mean_length": 6524.6015625,
+      "completions/mean_terminated_length": 6123.81298828125,
+      "completions/min_length": 362.0,
+      "completions/min_terminated_length": 362.0,
+      "entropy": 0.9052172750234604,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026063446421176195,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 119210997.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23751860857009888,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999611377716064,
+      "sampling/importance_sampling_ratio/min": 8.774310117587447e-06,
+      "sampling/sampling_logp_difference/max": 11.643682479858398,
+      "sampling/sampling_logp_difference/mean": 0.019871948286890984,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 2.8274008855078137e-05,
+      "clip_ratio/high_mean": 7.068502213769534e-06,
+      "clip_ratio/low_mean": 5.824237177876057e-05,
+      "clip_ratio/low_min": 9.362729997519637e-06,
+      "clip_ratio/region_mean": 6.531087387884327e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14731.0,
+      "completions/mean_length": 6606.34375,
+      "completions/mean_terminated_length": 6208.8779296875,
+      "completions/min_length": 1123.0,
+      "completions/min_terminated_length": 1123.0,
+      "entropy": 0.923908606171608,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002111563691869378,
+      "learning_rate": 1e-05,
+      "loss": 0.0834,
+      "num_tokens": 120076777.0,
+      "reward": 0.3359375,
+      "reward_std": 0.32879000902175903,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999362230300903,
+      "sampling/importance_sampling_ratio/min": 7.220578579492098e-10,
+      "sampling/sampling_logp_difference/max": 21.04891586303711,
+      "sampling/sampling_logp_difference/mean": 0.01944371685385704,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 2.226728611276485e-05,
+      "clip_ratio/high_mean": 6.534373824251816e-06,
+      "clip_ratio/low_mean": 2.137331728135905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7907691105610866e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 7156.2578125,
+      "completions/mean_terminated_length": 6934.79248046875,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "entropy": 1.0026871338486671,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002556675113737583,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 121013298.0,
+      "reward": 0.2890625,
+      "reward_std": 0.26013973355293274,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999322295188904,
+      "sampling/importance_sampling_ratio/min": 1.3007297638978343e-05,
+      "sampling/sampling_logp_difference/max": 11.25,
+      "sampling/sampling_logp_difference/mean": 0.02018606849014759,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 9.798196060728515e-06,
+      "clip_ratio/high_mean": 2.4495490151821286e-06,
+      "clip_ratio/low_mean": 6.042695122232544e-05,
+      "clip_ratio/low_min": 1.0388962436991278e-05,
+      "clip_ratio/region_mean": 6.287649966907338e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15184.0,
+      "completions/mean_length": 6177.3828125,
+      "completions/mean_terminated_length": 5848.13671875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.7995355725288391,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0032885256223380566,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 121820851.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35141900181770325,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.3007570487388875e-05,
+      "sampling/sampling_logp_difference/max": 11.249979019165039,
+      "sampling/sampling_logp_difference/mean": 0.018013037741184235,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.836798173826537e-05,
+      "clip_ratio/high_mean": 4.591995434566343e-06,
+      "clip_ratio/low_mean": 5.0241384542459855e-05,
+      "clip_ratio/low_min": 7.033341489659506e-06,
+      "clip_ratio/region_mean": 5.483338100020774e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15941.0,
+      "completions/mean_length": 6033.359375,
+      "completions/mean_terminated_length": 5612.6015625,
+      "completions/min_length": 551.0,
+      "completions/min_terminated_length": 551.0,
+      "entropy": 0.8770530596375465,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0035782051272690296,
+      "learning_rate": 1e-05,
+      "loss": 0.1015,
+      "num_tokens": 122615329.0,
+      "reward": 0.421875,
+      "reward_std": 0.3253750801086426,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000176429748535,
+      "sampling/importance_sampling_ratio/min": 8.344570233020931e-05,
+      "sampling/sampling_logp_difference/max": 9.391314506530762,
+      "sampling/sampling_logp_difference/mean": 0.018681444227695465,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 1.2653852763833129e-05,
+      "clip_ratio/high_mean": 4.80866970065108e-06,
+      "clip_ratio/low_mean": 3.11289915089219e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.593766109588614e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14860.0,
+      "completions/mean_length": 8237.46875,
+      "completions/mean_terminated_length": 7974.67724609375,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "entropy": 0.9543669074773788,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026586023159325123,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 123688709.0,
+      "reward": 0.328125,
+      "reward_std": 0.30327308177948,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228119850159,
+      "sampling/importance_sampling_ratio/min": 0.00017198453133460134,
+      "sampling/sampling_logp_difference/max": 8.668106079101562,
+      "sampling/sampling_logp_difference/mean": 0.020768223330378532,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 4.32630758950836e-06,
+      "clip_ratio/high_mean": 1.08157689737709e-06,
+      "clip_ratio/low_mean": 3.721513610344118e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.829671300081827e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15920.0,
+      "completions/mean_length": 6649.1015625,
+      "completions/mean_terminated_length": 6000.10888671875,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 0.8519875407218933,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028182135429233313,
+      "learning_rate": 1e-05,
+      "loss": 0.0528,
+      "num_tokens": 124557298.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999065399169922,
+      "sampling/importance_sampling_ratio/min": 6.050919910194352e-05,
+      "sampling/sampling_logp_difference/max": 9.712715148925781,
+      "sampling/sampling_logp_difference/mean": 0.019195500761270523,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.812353937377338e-06,
+      "clip_ratio/high_mean": 2.4530884843443346e-06,
+      "clip_ratio/low_mean": 1.864515820670931e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1098246747897065e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14946.0,
+      "completions/mean_length": 6262.125,
+      "completions/mean_terminated_length": 5587.33349609375,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "entropy": 0.9227473363280296,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0018271139124408364,
+      "learning_rate": 1e-05,
+      "loss": 0.0162,
+      "num_tokens": 125378002.0,
+      "reward": 0.421875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998780488967896,
+      "sampling/importance_sampling_ratio/min": 1.1365813179509132e-06,
+      "sampling/sampling_logp_difference/max": 13.687485694885254,
+      "sampling/sampling_logp_difference/mean": 0.018991345539689064,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 1.976754219867871e-05,
+      "clip_ratio/high_mean": 5.881085598957725e-06,
+      "clip_ratio/low_mean": 4.014476598968031e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6025852043385385e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16334.0,
+      "completions/mean_length": 6543.2734375,
+      "completions/mean_terminated_length": 6465.78759765625,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "entropy": 0.9931852892041206,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028531099669635296,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 126236133.0,
+      "reward": 0.2734375,
+      "reward_std": 0.3148259222507477,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000286102294922,
+      "sampling/importance_sampling_ratio/min": 1.9964969396824017e-05,
+      "sampling/sampling_logp_difference/max": 10.821531295776367,
+      "sampling/sampling_logp_difference/mean": 0.020335232838988304,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 2.1589371499430854e-05,
+      "clip_ratio/high_mean": 8.165637723323016e-06,
+      "clip_ratio/low_mean": 6.554757646881626e-05,
+      "clip_ratio/low_min": 5.570906523644226e-06,
+      "clip_ratio/region_mean": 7.371321362370509e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13107.0,
+      "completions/mean_length": 5567.2890625,
+      "completions/mean_terminated_length": 5482.1181640625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9842768535017967,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017482106341049075,
+      "learning_rate": 1e-05,
+      "loss": 0.0019,
+      "num_tokens": 126974666.0,
+      "reward": 0.25,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999868631362915,
+      "sampling/importance_sampling_ratio/min": 0.011517977342009544,
+      "sampling/sampling_logp_difference/max": 4.463846206665039,
+      "sampling/sampling_logp_difference/mean": 0.020022576674818993,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.0515780559217092e-05,
+      "clip_ratio/high_mean": 2.628945139804273e-06,
+      "clip_ratio/low_mean": 5.164334470464382e-05,
+      "clip_ratio/low_min": 3.369817250131746e-06,
+      "clip_ratio/region_mean": 5.427229007182177e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14865.0,
+      "completions/mean_length": 7232.6328125,
+      "completions/mean_terminated_length": 6937.42724609375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9599866047501564,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001637064153328538,
+      "learning_rate": 1e-05,
+      "loss": 0.0918,
+      "num_tokens": 127921331.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000075101852417,
+      "sampling/importance_sampling_ratio/min": 0.00023060032981447875,
+      "sampling/sampling_logp_difference/max": 8.374824523925781,
+      "sampling/sampling_logp_difference/mean": 0.01991824433207512,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.7373587070323993e-05,
+      "clip_ratio/high_mean": 4.343396767580998e-06,
+      "clip_ratio/low_mean": 2.182850187182339e-05,
+      "clip_ratio/low_min": 4.473072294786107e-06,
+      "clip_ratio/region_mean": 2.6171898525717552e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15075.0,
+      "completions/max_terminated_length": 15075.0,
+      "completions/mean_length": 4948.546875,
+      "completions/mean_terminated_length": 4948.546875,
+      "completions/min_length": 609.0,
+      "completions/min_terminated_length": 609.0,
+      "entropy": 0.9903113394975662,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00162114470731467,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 128575785.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999828040599823,
+      "sampling/importance_sampling_ratio/min": 3.263082589910482e-06,
+      "sampling/sampling_logp_difference/max": 12.632838249206543,
+      "sampling/sampling_logp_difference/mean": 0.019144343212246895,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 1.2063027497788426e-05,
+      "clip_ratio/high_mean": 4.366232360553113e-06,
+      "clip_ratio/low_mean": 3.965049324961001e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4016725382789446e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6205.234375,
+      "completions/mean_terminated_length": 6125.08642578125,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 0.9164782017469406,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0021650632843375206,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 129389191.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3214311897754669,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 0.0009118906455114484,
+      "sampling/sampling_logp_difference/max": 6.999990463256836,
+      "sampling/sampling_logp_difference/mean": 0.01929439604282379,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 2.6859754598262953e-05,
+      "clip_ratio/high_mean": 6.714938649565738e-06,
+      "clip_ratio/low_mean": 1.6451138890261063e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.31660775398268e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15741.0,
+      "completions/max_terminated_length": 15741.0,
+      "completions/mean_length": 4911.25,
+      "completions/mean_terminated_length": 4911.25,
+      "completions/min_length": 125.0,
+      "completions/min_terminated_length": 125.0,
+      "entropy": 0.9057909473776817,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019606768619269133,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 130036711.0,
+      "reward": 0.296875,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999849796295166,
+      "sampling/importance_sampling_ratio/min": 0.0008691518451087177,
+      "sampling/sampling_logp_difference/max": 7.047992706298828,
+      "sampling/sampling_logp_difference/mean": 0.020085586234927177,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.847699741119868e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.847699741119868e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15152.0,
+      "completions/mean_length": 6222.0859375,
+      "completions/mean_terminated_length": 5978.2001953125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.102900318801403,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0013436009176075459,
+      "learning_rate": 1e-05,
+      "loss": 0.0116,
+      "num_tokens": 130854714.0,
+      "reward": 0.21875,
+      "reward_std": 0.1825428307056427,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999322891235352,
+      "sampling/importance_sampling_ratio/min": 3.319984534755349e-05,
+      "sampling/sampling_logp_difference/max": 10.312965393066406,
+      "sampling/sampling_logp_difference/mean": 0.02261950448155403,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 1.0113483313034521e-05,
+      "clip_ratio/high_mean": 3.4217127904412337e-06,
+      "clip_ratio/low_mean": 3.916404375559068e-05,
+      "clip_ratio/low_min": 4.7332350732176565e-06,
+      "clip_ratio/region_mean": 4.258575745552662e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 6490.7734375,
+      "completions/mean_terminated_length": 6333.73828125,
+      "completions/min_length": 613.0,
+      "completions/min_terminated_length": 613.0,
+      "entropy": 0.9576810225844383,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025689650792628527,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 131703429.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3385029733181,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999434947967529,
+      "sampling/importance_sampling_ratio/min": 0.00037599547067657113,
+      "sampling/sampling_logp_difference/max": 7.8859333992004395,
+      "sampling/sampling_logp_difference/mean": 0.01931593380868435,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.780203212500055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.780203212500055e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14897.0,
+      "completions/mean_length": 6957.453125,
+      "completions/mean_terminated_length": 6653.37060546875,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "entropy": 0.9904302433133125,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002132449997588992,
+      "learning_rate": 1e-05,
+      "loss": 0.0848,
+      "num_tokens": 132614583.0,
+      "reward": 0.34375,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999384880065918,
+      "sampling/importance_sampling_ratio/min": 9.969094350026353e-08,
+      "sampling/sampling_logp_difference/max": 16.121191024780273,
+      "sampling/sampling_logp_difference/mean": 0.019748074933886528,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 1.6620725091343047e-05,
+      "clip_ratio/high_mean": 6.429913469219173e-06,
+      "clip_ratio/low_mean": 6.847188262781856e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.49017954149167e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15031.0,
+      "completions/mean_length": 6781.3828125,
+      "completions/mean_terminated_length": 6391.0322265625,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "entropy": 0.7702180370688438,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0037141458597034216,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 133500672.0,
+      "reward": 0.4140625,
+      "reward_std": 0.39294689893722534,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 0.0015879785642027855,
+      "sampling/sampling_logp_difference/max": 6.445293426513672,
+      "sampling/sampling_logp_difference/mean": 0.017618997022509575,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 8.414747526330757e-06,
+      "clip_ratio/high_mean": 2.1036868815826892e-06,
+      "clip_ratio/low_mean": 2.6748189156933222e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8851876209046168e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16383.0,
+      "completions/mean_length": 7167.6953125,
+      "completions/mean_terminated_length": 7095.1259765625,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 1.0333677157759666,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021144442725926638,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 134437361.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24671243131160736,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999521970748901,
+      "sampling/importance_sampling_ratio/min": 0.0020202873274683952,
+      "sampling/sampling_logp_difference/max": 6.20451545715332,
+      "sampling/sampling_logp_difference/mean": 0.021626941859722137,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 7.359868050116347e-06,
+      "clip_ratio/high_mean": 1.8399670125290868e-06,
+      "clip_ratio/low_mean": 3.642534238679218e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.826530939932127e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15035.0,
+      "completions/mean_length": 5934.9453125,
+      "completions/mean_terminated_length": 5684.16845703125,
+      "completions/min_length": 496.0,
+      "completions/min_terminated_length": 496.0,
+      "entropy": 0.8884351700544357,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025075129233300686,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 135215690.0,
+      "reward": 0.5078125,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000145435333252,
+      "sampling/importance_sampling_ratio/min": 8.12270229744172e-07,
+      "sampling/sampling_logp_difference/max": 14.023432731628418,
+      "sampling/sampling_logp_difference/mean": 0.018633443862199783,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 6.931506504770368e-06,
+      "clip_ratio/high_mean": 1.732876626192592e-06,
+      "clip_ratio/low_mean": 6.461201871843514e-05,
+      "clip_ratio/low_min": 9.272769602830522e-06,
+      "clip_ratio/region_mean": 6.634489625412243e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16331.0,
+      "completions/mean_length": 7267.296875,
+      "completions/mean_terminated_length": 7048.49609375,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 1.072906270623207,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0023191061336547136,
+      "learning_rate": 1e-05,
+      "loss": 0.1216,
+      "num_tokens": 136165880.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3400956988334656,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999949932098389,
+      "sampling/importance_sampling_ratio/min": 8.937300299294293e-05,
+      "sampling/sampling_logp_difference/max": 9.322691917419434,
+      "sampling/sampling_logp_difference/mean": 0.02122514694929123,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 7.245442930070567e-06,
+      "clip_ratio/high_mean": 1.8113607325176417e-06,
+      "clip_ratio/low_mean": 5.239449455984868e-05,
+      "clip_ratio/low_min": 7.146442158045829e-06,
+      "clip_ratio/region_mean": 5.420585534920974e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16230.0,
+      "completions/mean_length": 7433.1640625,
+      "completions/mean_terminated_length": 7362.68505859375,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "entropy": 1.0957217290997505,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029631280340254307,
+      "learning_rate": 1e-05,
+      "loss": 0.0352,
+      "num_tokens": 137140413.0,
+      "reward": 0.265625,
+      "reward_std": 0.28749164938926697,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999362230300903,
+      "sampling/importance_sampling_ratio/min": 0.0086804935708642,
+      "sampling/sampling_logp_difference/max": 4.746676921844482,
+      "sampling/sampling_logp_difference/mean": 0.022480733692646027,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 6.239364211069187e-06,
+      "clip_ratio/high_mean": 1.5598410527672968e-06,
+      "clip_ratio/low_mean": 3.690561521807467e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.846545632768539e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15985.0,
+      "completions/mean_length": 7073.90625,
+      "completions/mean_terminated_length": 6926.12744140625,
+      "completions/min_length": 1398.0,
+      "completions/min_terminated_length": 1398.0,
+      "entropy": 0.9333122596144676,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.000832411227747798,
+      "learning_rate": 1e-05,
+      "loss": 0.0312,
+      "num_tokens": 138064537.0,
+      "reward": 0.3671875,
+      "reward_std": 0.13888052105903625,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998854994773865,
+      "sampling/importance_sampling_ratio/min": 0.0002638234291225672,
+      "sampling/sampling_logp_difference/max": 8.240230560302734,
+      "sampling/sampling_logp_difference/mean": 0.019753674045205116,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.8504628946611774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8504628946611774e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15719.0,
+      "completions/mean_length": 5680.59375,
+      "completions/mean_terminated_length": 5596.31494140625,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 0.9720541462302208,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002570893382653594,
+      "learning_rate": 1e-05,
+      "loss": 0.0289,
+      "num_tokens": 138809293.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3703257441520691,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
+      "sampling/importance_sampling_ratio/min": 1.1064497584811761e-07,
+      "sampling/sampling_logp_difference/max": 16.016939163208008,
+      "sampling/sampling_logp_difference/mean": 0.019471734762191772,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.860648109821341e-05,
+      "clip_ratio/low_min": 6.799404218327254e-06,
+      "clip_ratio/region_mean": 3.860648109821341e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15983.0,
+      "completions/mean_length": 8024.34375,
+      "completions/mean_terminated_length": 7540.72705078125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 1.0136078596115112,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017353243893012404,
+      "learning_rate": 1e-05,
+      "loss": 0.0753,
+      "num_tokens": 139856281.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2551271915435791,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999872446060181,
+      "sampling/importance_sampling_ratio/min": 0.0012184304650872946,
+      "sampling/sampling_logp_difference/max": 6.71019172668457,
+      "sampling/sampling_logp_difference/mean": 0.021411728113889694,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 2.0505477323240484e-05,
+      "clip_ratio/high_mean": 5.126369330810121e-06,
+      "clip_ratio/low_mean": 5.543978954847262e-05,
+      "clip_ratio/low_min": 6.273411372603732e-06,
+      "clip_ratio/region_mean": 6.056615916349983e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 7543.96875,
+      "completions/mean_terminated_length": 7032.5615234375,
+      "completions/min_length": 747.0,
+      "completions/min_terminated_length": 747.0,
+      "entropy": 0.9921196177601814,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019490106496959925,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 140843861.0,
+      "reward": 0.296875,
+      "reward_std": 0.34717273712158203,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999728202819824,
+      "sampling/importance_sampling_ratio/min": 0.002482798881828785,
+      "sampling/sampling_logp_difference/max": 5.998368740081787,
+      "sampling/sampling_logp_difference/mean": 0.020561274141073227,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 2.1780562747153454e-05,
+      "clip_ratio/high_mean": 7.637661838089116e-06,
+      "clip_ratio/low_mean": 5.0004296554106986e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.76419583921961e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16320.0,
+      "completions/max_terminated_length": 16320.0,
+      "completions/mean_length": 6285.1796875,
+      "completions/mean_terminated_length": 6285.1796875,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.8724544793367386,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027221282944083214,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 141666372.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999271631240845,
+      "sampling/importance_sampling_ratio/min": 0.0001951520098373294,
+      "sampling/sampling_logp_difference/max": 8.541731834411621,
+      "sampling/sampling_logp_difference/mean": 0.01924072578549385,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 1.2773067282978445e-05,
+      "clip_ratio/high_mean": 3.1932668207446113e-06,
+      "clip_ratio/low_mean": 5.425560334515467e-05,
+      "clip_ratio/low_min": 8.365065696125384e-06,
+      "clip_ratio/region_mean": 5.744886925640458e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 7659.6796875,
+      "completions/mean_terminated_length": 7230.6142578125,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.9285296350717545,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0016997806960716844,
+      "learning_rate": 1e-05,
+      "loss": 0.0352,
+      "num_tokens": 142665635.0,
+      "reward": 0.328125,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 1.8975185867020627e-07,
+      "sampling/sampling_logp_difference/max": 15.477548599243164,
+      "sampling/sampling_logp_difference/mean": 0.020274491980671883,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 2.486542780388845e-05,
+      "clip_ratio/high_mean": 6.216356950972113e-06,
+      "clip_ratio/low_mean": 3.3204854901214276e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9421211965873226e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14834.0,
+      "completions/max_terminated_length": 14834.0,
+      "completions/mean_length": 5331.03125,
+      "completions/mean_terminated_length": 5331.03125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.7720941603183746,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030591271352022886,
+      "learning_rate": 1e-05,
+      "loss": -0.0544,
+      "num_tokens": 143364919.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 2.998966630585187e-09,
+      "sampling/sampling_logp_difference/max": 19.624998092651367,
+      "sampling/sampling_logp_difference/mean": 0.01690140925347805,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.0562233001110144e-05,
+      "clip_ratio/high_mean": 3.6131090155322454e-06,
+      "clip_ratio/low_mean": 5.028249574934307e-05,
+      "clip_ratio/low_min": 3.0328762932185782e-06,
+      "clip_ratio/region_mean": 5.3895605788056855e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15895.0,
+      "completions/mean_length": 7086.65625,
+      "completions/mean_terminated_length": 6708.71533203125,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "entropy": 0.8584504351019859,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0015365247381851077,
+      "learning_rate": 1e-05,
+      "loss": 0.0465,
+      "num_tokens": 144293867.0,
+      "reward": 0.2578125,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915791511536,
+      "sampling/importance_sampling_ratio/min": 0.00015850062482059002,
+      "sampling/sampling_logp_difference/max": 8.749752044677734,
+      "sampling/sampling_logp_difference/mean": 0.019430743530392647,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 6.546216354763601e-06,
+      "clip_ratio/high_mean": 1.6365540886909002e-06,
+      "clip_ratio/low_mean": 3.201156800969329e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.364812232575787e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 5455.6484375,
+      "completions/mean_terminated_length": 5369.5986328125,
+      "completions/min_length": 22.0,
+      "completions/min_terminated_length": 22.0,
+      "entropy": 0.8517125397920609,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003156432416290045,
+      "learning_rate": 1e-05,
+      "loss": 0.0352,
+      "num_tokens": 145013318.0,
+      "reward": 0.390625,
+      "reward_std": 0.25726157426834106,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000219345092773,
+      "sampling/importance_sampling_ratio/min": 0.10733240842819214,
+      "sampling/sampling_logp_difference/max": 2.2318246364593506,
+      "sampling/sampling_logp_difference/mean": 0.01860412396490574,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 4.192453593532264e-05,
+      "clip_ratio/high_mean": 1.196126476088466e-05,
+      "clip_ratio/low_mean": 4.6358243707800284e-05,
+      "clip_ratio/low_min": 5.576871444645803e-06,
+      "clip_ratio/region_mean": 5.8319507388659986e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 6670.2890625,
+      "completions/mean_terminated_length": 6192.5654296875,
+      "completions/min_length": 795.0,
+      "completions/min_terminated_length": 795.0,
+      "entropy": 0.8807757273316383,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028573600575327873,
+      "learning_rate": 1e-05,
+      "loss": 0.1163,
+      "num_tokens": 145886291.0,
+      "reward": 0.46875,
+      "reward_std": 0.38269224762916565,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 0.0006086408975534141,
+      "sampling/sampling_logp_difference/max": 7.404282093048096,
+      "sampling/sampling_logp_difference/mean": 0.01879466325044632,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 5.954649168415926e-06,
+      "clip_ratio/high_mean": 1.4886622921039816e-06,
+      "clip_ratio/low_mean": 2.10815471746173e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.257020946672128e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12881.0,
+      "completions/max_terminated_length": 12881.0,
+      "completions/mean_length": 5849.8359375,
+      "completions/mean_terminated_length": 5849.8359375,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 0.879327155649662,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028504019137471914,
+      "learning_rate": 1e-05,
+      "loss": 0.0731,
+      "num_tokens": 146658174.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2596206068992615,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999953508377075,
+      "sampling/importance_sampling_ratio/min": 0.0004885811940766871,
+      "sampling/sampling_logp_difference/max": 7.62400484085083,
+      "sampling/sampling_logp_difference/mean": 0.019282957538962364,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 1.0011702670453815e-05,
+      "clip_ratio/high_mean": 3.558776029422006e-06,
+      "clip_ratio/low_mean": 2.338160857107141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.694038448680658e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15885.0,
+      "completions/mean_length": 6376.7578125,
+      "completions/mean_terminated_length": 6297.96044921875,
+      "completions/min_length": 527.0,
+      "completions/min_terminated_length": 527.0,
+      "entropy": 1.0437361896038055,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026664668694138527,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 147494367.0,
+      "reward": 0.25,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999197721481323,
+      "sampling/importance_sampling_ratio/min": 5.43163696420379e-06,
+      "sampling/sampling_logp_difference/max": 12.123270034790039,
+      "sampling/sampling_logp_difference/mean": 0.020121946930885315,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 4.071263447258389e-06,
+      "clip_ratio/high_mean": 1.0178158618145972e-06,
+      "clip_ratio/low_mean": 5.679830292137922e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.781611889688065e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15314.0,
+      "completions/max_terminated_length": 15314.0,
+      "completions/mean_length": 6753.0390625,
+      "completions/mean_terminated_length": 6753.0390625,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.8704448491334915,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0013236560625955462,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 148377476.0,
+      "reward": 0.390625,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999928891658783,
+      "sampling/importance_sampling_ratio/min": 0.0005196586716920137,
+      "sampling/sampling_logp_difference/max": 7.562338352203369,
+      "sampling/sampling_logp_difference/mean": 0.019745871424674988,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.1118761626203195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1118761626203195e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14670.0,
+      "completions/mean_length": 6334.5625,
+      "completions/mean_terminated_length": 6255.43310546875,
+      "completions/min_length": 835.0,
+      "completions/min_terminated_length": 835.0,
+      "entropy": 0.9675566852092743,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003227849490940571,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 149213140.0,
+      "reward": 0.265625,
+      "reward_std": 0.22331714630126953,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805092811584,
+      "sampling/importance_sampling_ratio/min": 2.0039660739712417e-06,
+      "sampling/sampling_logp_difference/max": 13.120382308959961,
+      "sampling/sampling_logp_difference/mean": 0.02062838338315487,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 2.159174937332864e-05,
+      "clip_ratio/high_mean": 7.343517381741549e-06,
+      "clip_ratio/low_mean": 2.7624131234915694e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.496764873034408e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15878.0,
+      "completions/mean_length": 5986.3125,
+      "completions/mean_terminated_length": 5650.90283203125,
+      "completions/min_length": 482.0,
+      "completions/min_terminated_length": 482.0,
+      "entropy": 0.9257830232381821,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023177729453891516,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 149998732.0,
+      "reward": 0.4375,
+      "reward_std": 0.32589423656463623,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000395774841309,
+      "sampling/importance_sampling_ratio/min": 0.00015848006296437234,
+      "sampling/sampling_logp_difference/max": 8.749881744384766,
+      "sampling/sampling_logp_difference/mean": 0.018431315198540688,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 1.0338640322515857e-05,
+      "clip_ratio/high_mean": 2.5846600806289644e-06,
+      "clip_ratio/low_mean": 4.149641688400152e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.408107668041339e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15871.0,
+      "completions/mean_length": 7341.390625,
+      "completions/mean_terminated_length": 7049.693359375,
+      "completions/min_length": 789.0,
+      "completions/min_terminated_length": 789.0,
+      "entropy": 0.9617493599653244,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001992360921576619,
+      "learning_rate": 1e-05,
+      "loss": 0.0342,
+      "num_tokens": 150958414.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29119330644607544,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999643564224243,
+      "sampling/importance_sampling_ratio/min": 0.0011714966967701912,
+      "sampling/sampling_logp_difference/max": 6.7494730949401855,
+      "sampling/sampling_logp_difference/mean": 0.02040865272283554,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 1.402321640853188e-05,
+      "clip_ratio/high_mean": 4.2662558144002105e-06,
+      "clip_ratio/low_mean": 4.847697437071474e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.274322995774128e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15966.0,
+      "completions/mean_length": 6194.53125,
+      "completions/mean_terminated_length": 5605.0576171875,
+      "completions/min_length": 1022.0,
+      "completions/min_terminated_length": 1022.0,
+      "entropy": 0.7917485684156418,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002244317904114723,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 151770450.0,
+      "reward": 0.46875,
+      "reward_std": 0.29432153701782227,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999660849571228,
+      "sampling/importance_sampling_ratio/min": 0.0007107177516445518,
+      "sampling/sampling_logp_difference/max": 7.249235153198242,
+      "sampling/sampling_logp_difference/mean": 0.016992967575788498,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 1.0843792097148253e-05,
+      "clip_ratio/high_mean": 2.710948024287063e-06,
+      "clip_ratio/low_mean": 5.327871485860669e-05,
+      "clip_ratio/low_min": 8.019090955713182e-06,
+      "clip_ratio/region_mean": 5.598966299658059e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15010.0,
+      "completions/mean_length": 6883.328125,
+      "completions/mean_terminated_length": 6808.51953125,
+      "completions/min_length": 90.0,
+      "completions/min_terminated_length": 90.0,
+      "entropy": 0.8912994414567947,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028390102088451385,
+      "learning_rate": 1e-05,
+      "loss": 0.0662,
+      "num_tokens": 152668740.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3684907555580139,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999127388000488,
+      "sampling/importance_sampling_ratio/min": 0.00014138928963802755,
+      "sampling/sampling_logp_difference/max": 8.863993644714355,
+      "sampling/sampling_logp_difference/mean": 0.018673548474907875,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 1.0902768735832069e-05,
+      "clip_ratio/high_mean": 2.7256921839580173e-06,
+      "clip_ratio/low_mean": 3.64547792059966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918047127626778e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15506.0,
+      "completions/mean_length": 7799.5234375,
+      "completions/mean_terminated_length": 7227.2255859375,
+      "completions/min_length": 908.0,
+      "completions/min_terminated_length": 908.0,
+      "entropy": 0.81409652531147,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031472526025027037,
+      "learning_rate": 1e-05,
+      "loss": 0.0106,
+      "num_tokens": 153684919.0,
+      "reward": 0.265625,
+      "reward_std": 0.2924865484237671,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999836802482605,
+      "sampling/importance_sampling_ratio/min": 0.0033896781969815493,
+      "sampling/sampling_logp_difference/max": 5.687020301818848,
+      "sampling/sampling_logp_difference/mean": 0.020041968673467636,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 9.558767487760633e-06,
+      "clip_ratio/high_mean": 2.3896918719401583e-06,
+      "clip_ratio/low_mean": 2.064374041310657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.303343228504673e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14882.0,
+      "completions/max_terminated_length": 14882.0,
+      "completions/mean_length": 6441.78125,
+      "completions/mean_terminated_length": 6441.78125,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 1.0110936611890793,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0008370456052944064,
+      "learning_rate": 1e-05,
+      "loss": 0.0398,
+      "num_tokens": 154527195.0,
+      "reward": 0.3984375,
+      "reward_std": 0.14677615463733673,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999023079872131,
+      "sampling/importance_sampling_ratio/min": 0.00020978205429855734,
+      "sampling/sampling_logp_difference/max": 8.469441413879395,
+      "sampling/sampling_logp_difference/mean": 0.021425459533929825,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 4.3503982851689216e-06,
+      "clip_ratio/high_mean": 1.0875995712922304e-06,
+      "clip_ratio/low_mean": 2.6103265497567918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7190865182546986e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15901.0,
+      "completions/mean_length": 7140.2890625,
+      "completions/mean_terminated_length": 6918.4404296875,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.993028812110424,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004406601656228304,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 155457592.0,
+      "reward": 0.296875,
+      "reward_std": 0.24882915616035461,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998899698257446,
+      "sampling/importance_sampling_ratio/min": 0.005102821160107851,
+      "sampling/sampling_logp_difference/max": 5.277961730957031,
+      "sampling/sampling_logp_difference/mean": 0.020247166976332664,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.063482140281849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.063482140281849e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15280.0,
+      "completions/max_terminated_length": 15280.0,
+      "completions/mean_length": 6220.5703125,
+      "completions/mean_terminated_length": 6220.5703125,
+      "completions/min_length": 467.0,
+      "completions/min_terminated_length": 467.0,
+      "entropy": 0.9336734637618065,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0013446965022012591,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 156277609.0,
+      "reward": 0.3671875,
+      "reward_std": 0.32089442014694214,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0036465052980929613,
+      "sampling/sampling_logp_difference/max": 5.613986015319824,
+      "sampling/sampling_logp_difference/mean": 0.018678557127714157,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 1.0170509995077737e-05,
+      "clip_ratio/high_mean": 2.542627498769434e-06,
+      "clip_ratio/low_mean": 2.2835527090592223e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5378154816735332e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16143.0,
+      "completions/mean_length": 7230.3046875,
+      "completions/mean_terminated_length": 6935.02392578125,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.9315059334039688,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0007178800296969712,
+      "learning_rate": 1e-05,
+      "loss": 0.0817,
+      "num_tokens": 157222744.0,
+      "reward": 0.4453125,
+      "reward_std": 0.17517909407615662,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999822378158569,
+      "sampling/importance_sampling_ratio/min": 0.005948656238615513,
+      "sampling/sampling_logp_difference/max": 5.124589920043945,
+      "sampling/sampling_logp_difference/mean": 0.019229095429182053,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 8.961743105828646e-06,
+      "clip_ratio/high_mean": 2.2404357764571614e-06,
+      "clip_ratio/low_mean": 4.256807665115048e-05,
+      "clip_ratio/low_min": 4.9592349569138605e-06,
+      "clip_ratio/region_mean": 4.480851271182473e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15489.0,
+      "completions/mean_length": 7101.7890625,
+      "completions/mean_terminated_length": 6802.36279296875,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "entropy": 0.8410197496414185,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028408628422766924,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 158151901.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3237774670124054,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 1.1856438959512161e-06,
+      "sampling/sampling_logp_difference/max": 13.645224571228027,
+      "sampling/sampling_logp_difference/mean": 0.018435407429933548,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.0979279042876442e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0979279042876442e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15693.0,
+      "completions/mean_length": 6822.109375,
+      "completions/mean_terminated_length": 6670.33349609375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 0.9384881108999252,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003448180854320526,
+      "learning_rate": 1e-05,
+      "loss": 0.0354,
+      "num_tokens": 159043939.0,
+      "reward": 0.390625,
+      "reward_std": 0.2906692624092102,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 0.0018930588848888874,
+      "sampling/sampling_logp_difference/max": 6.269561290740967,
+      "sampling/sampling_logp_difference/mean": 0.01985720731317997,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 1.87569592071668e-05,
+      "clip_ratio/high_mean": 5.608627873243677e-06,
+      "clip_ratio/low_mean": 2.393421118540573e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.954283939970992e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16130.0,
+      "completions/mean_length": 6969.671875,
+      "completions/mean_terminated_length": 6665.98388671875,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "entropy": 0.8700083270668983,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002675072755664587,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 159955905.0,
+      "reward": 0.34375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 4.222963980282657e-06,
+      "sampling/sampling_logp_difference/max": 12.37497329711914,
+      "sampling/sampling_logp_difference/mean": 0.018493790179491043,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.0003448096540524e-05,
+      "clip_ratio/high_mean": 2.500862024135131e-06,
+      "clip_ratio/low_mean": 2.7816862200324977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0317724281303526e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16309.0,
+      "completions/mean_length": 6642.921875,
+      "completions/mean_terminated_length": 6409.13623046875,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "entropy": 1.0049321055412292,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034180639777332544,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 160825383.0,
+      "reward": 0.296875,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999150037765503,
+      "sampling/importance_sampling_ratio/min": 0.000667327141854912,
+      "sampling/sampling_logp_difference/max": 7.312230110168457,
+      "sampling/sampling_logp_difference/mean": 0.020563330501317978,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 5.628348844766151e-06,
+      "clip_ratio/high_mean": 1.4070872111915378e-06,
+      "clip_ratio/low_mean": 3.0009771876393643e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1416859314958856e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15930.0,
+      "completions/mean_length": 6327.296875,
+      "completions/mean_terminated_length": 6085.9365234375,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.8458633497357368,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016060187481343746,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 161653685.0,
+      "reward": 0.484375,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999157190322876,
+      "sampling/importance_sampling_ratio/min": 4.0065486246021464e-05,
+      "sampling/sampling_logp_difference/max": 10.124995231628418,
+      "sampling/sampling_logp_difference/mean": 0.018988098949193954,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.1031161648134002e-05,
+      "clip_ratio/high_mean": 2.7577904120335006e-06,
+      "clip_ratio/low_mean": 5.184456858842168e-05,
+      "clip_ratio/low_min": 3.209077931387583e-06,
+      "clip_ratio/region_mean": 5.460235854570783e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16008.0,
+      "completions/mean_length": 6871.4921875,
+      "completions/mean_terminated_length": 6643.1923828125,
+      "completions/min_length": 546.0,
+      "completions/min_terminated_length": 546.0,
+      "entropy": 0.8635450080037117,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027431908529251814,
+      "learning_rate": 1e-05,
+      "loss": 0.0519,
+      "num_tokens": 162555796.0,
+      "reward": 0.296875,
+      "reward_std": 0.2906692326068878,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999676942825317,
+      "sampling/importance_sampling_ratio/min": 1.8959757653647102e-05,
+      "sampling/sampling_logp_difference/max": 10.873191833496094,
+      "sampling/sampling_logp_difference/mean": 0.019010700285434723,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 1.122018943533476e-05,
+      "clip_ratio/high_mean": 2.80504735883369e-06,
+      "clip_ratio/low_mean": 3.166110184338322e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4466149031686655e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15032.0,
+      "completions/mean_length": 5741.7734375,
+      "completions/mean_terminated_length": 5657.9765625,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.820662334561348,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021551409736275673,
+      "learning_rate": 1e-05,
+      "loss": 0.0325,
+      "num_tokens": 163312831.0,
+      "reward": 0.3828125,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999495148658752,
+      "sampling/importance_sampling_ratio/min": 0.00020485777349676937,
+      "sampling/sampling_logp_difference/max": 8.493194580078125,
+      "sampling/sampling_logp_difference/mean": 0.018189631402492523,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 5.249454261502251e-06,
+      "clip_ratio/high_mean": 2.6246168545185355e-06,
+      "clip_ratio/low_mean": 5.6316800055356e-05,
+      "clip_ratio/low_min": 6.944251708773663e-06,
+      "clip_ratio/region_mean": 5.894141622775351e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15114.0,
+      "completions/max_terminated_length": 15114.0,
+      "completions/mean_length": 6707.234375,
+      "completions/mean_terminated_length": 6707.234375,
+      "completions/min_length": 1053.0,
+      "completions/min_terminated_length": 1053.0,
+      "entropy": 0.9361380413174629,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0021163856144994497,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 164189605.0,
+      "reward": 0.21875,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998763799667358,
+      "sampling/importance_sampling_ratio/min": 6.894206876495446e-07,
+      "sampling/sampling_logp_difference/max": 14.187414169311523,
+      "sampling/sampling_logp_difference/mean": 0.020120715722441673,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 1.2976960988453357e-05,
+      "clip_ratio/high_mean": 3.244240247113339e-06,
+      "clip_ratio/low_mean": 4.118970764466212e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.44339480054623e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15672.0,
+      "completions/mean_length": 7074.59375,
+      "completions/mean_terminated_length": 6774.2900390625,
+      "completions/min_length": 987.0,
+      "completions/min_terminated_length": 987.0,
+      "entropy": 0.9206110090017319,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003191466676071286,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 165114649.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999928891658783,
+      "sampling/importance_sampling_ratio/min": 0.0015704745892435312,
+      "sampling/sampling_logp_difference/max": 6.4563775062561035,
+      "sampling/sampling_logp_difference/mean": 0.020029421895742416,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 2.4998532580866595e-05,
+      "clip_ratio/high_mean": 6.947302438220504e-06,
+      "clip_ratio/low_mean": 4.305635661694396e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.000365831620002e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15481.0,
+      "completions/mean_length": 6510.3984375,
+      "completions/mean_terminated_length": 6432.6533203125,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "entropy": 0.9344880431890488,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002458518138155341,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 165971100.0,
+      "reward": 0.484375,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999246597290039,
+      "sampling/importance_sampling_ratio/min": 0.0011708823731169105,
+      "sampling/sampling_logp_difference/max": 6.749997615814209,
+      "sampling/sampling_logp_difference/mean": 0.02032654918730259,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 1.9761582279897993e-05,
+      "clip_ratio/high_mean": 4.940395569974498e-06,
+      "clip_ratio/low_mean": 2.598603293790802e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.092642862156936e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16364.0,
+      "completions/max_terminated_length": 16364.0,
+      "completions/mean_length": 5363.4609375,
+      "completions/mean_terminated_length": 5363.4609375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "entropy": 0.8528282344341278,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020360907074064016,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 166676943.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911470413208,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0005493607022799551,
+      "sampling/sampling_logp_difference/max": 7.506755352020264,
+      "sampling/sampling_logp_difference/mean": 0.01911250874400139,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 6.622867658734322e-06,
+      "clip_ratio/high_mean": 1.6557169146835804e-06,
+      "clip_ratio/low_mean": 4.006644434184636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.172216131337336e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14735.0,
+      "completions/mean_length": 4550.203125,
+      "completions/mean_terminated_length": 4266.1923828125,
+      "completions/min_length": 561.0,
+      "completions/min_terminated_length": 561.0,
+      "entropy": 0.7535714656114578,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015881177969276905,
+      "learning_rate": 1e-05,
+      "loss": 0.0952,
+      "num_tokens": 167278489.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999875009059906,
+      "sampling/importance_sampling_ratio/min": 7.485204696422443e-05,
+      "sampling/sampling_logp_difference/max": 9.49999713897705,
+      "sampling/sampling_logp_difference/mean": 0.016919689252972603,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 2.8397119422152173e-05,
+      "clip_ratio/high_mean": 7.099279855538043e-06,
+      "clip_ratio/low_mean": 2.2654034410152235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9753314493063954e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16087.0,
+      "completions/mean_length": 5080.078125,
+      "completions/mean_terminated_length": 4991.07080078125,
+      "completions/min_length": 684.0,
+      "completions/min_terminated_length": 684.0,
+      "entropy": 0.922355130314827,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021621519699692726,
+      "learning_rate": 1e-05,
+      "loss": 0.0634,
+      "num_tokens": 167949827.0,
+      "reward": 0.5546875,
+      "reward_std": 0.21829968690872192,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998918771743774,
+      "sampling/importance_sampling_ratio/min": 9.328075248049572e-05,
+      "sampling/sampling_logp_difference/max": 9.27989673614502,
+      "sampling/sampling_logp_difference/mean": 0.018358757719397545,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 1.3618362117995275e-05,
+      "clip_ratio/high_mean": 4.41220004177012e-06,
+      "clip_ratio/low_mean": 6.229132804946858e-05,
+      "clip_ratio/low_min": 1.1466368505352875e-05,
+      "clip_ratio/region_mean": 6.670352740911767e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15665.0,
+      "completions/max_terminated_length": 15665.0,
+      "completions/mean_length": 6371.9453125,
+      "completions/mean_terminated_length": 6371.9453125,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.8835635632276535,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.003488079411908984,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 168781948.0,
+      "reward": 0.46875,
+      "reward_std": 0.4673760235309601,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999973773956299,
+      "sampling/importance_sampling_ratio/min": 4.154009047852014e-08,
+      "sampling/sampling_logp_difference/max": 16.996606826782227,
+      "sampling/sampling_logp_difference/mean": 0.01854466274380684,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 1.3789490822091466e-05,
+      "clip_ratio/high_mean": 3.4473727055228665e-06,
+      "clip_ratio/low_mean": 3.9819827861720114e-05,
+      "clip_ratio/low_min": 9.205373771692393e-06,
+      "clip_ratio/region_mean": 4.3267199771435116e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15957.0,
+      "completions/mean_length": 7045.234375,
+      "completions/mean_terminated_length": 6665.609375,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 0.8657141029834747,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002579214284196496,
+      "learning_rate": 1e-05,
+      "loss": 0.0787,
+      "num_tokens": 169704370.0,
+      "reward": 0.390625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999009370803833,
+      "sampling/importance_sampling_ratio/min": 0.00038033726741559803,
+      "sampling/sampling_logp_difference/max": 7.874452114105225,
+      "sampling/sampling_logp_difference/mean": 0.020650357007980347,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.0065672540804371e-05,
+      "clip_ratio/high_mean": 2.516418135201093e-06,
+      "clip_ratio/low_mean": 2.5041783715096244e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7558201850297337e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13301.0,
+      "completions/mean_length": 4835.1015625,
+      "completions/mean_terminated_length": 4744.16552734375,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "entropy": 0.8166600242257118,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015265591209754348,
+      "learning_rate": 1e-05,
+      "loss": 0.0399,
+      "num_tokens": 170343191.0,
+      "reward": 0.4765625,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999908983707428,
+      "sampling/importance_sampling_ratio/min": 0.0008047395385801792,
+      "sampling/sampling_logp_difference/max": 7.1249918937683105,
+      "sampling/sampling_logp_difference/mean": 0.01807256042957306,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.965024677654583e-05,
+      "clip_ratio/low_min": 3.7946631437080214e-06,
+      "clip_ratio/region_mean": 3.965024677654583e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 6042.6328125,
+      "completions/mean_terminated_length": 5622.251953125,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 0.8976519927382469,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019487867830321193,
+      "learning_rate": 1e-05,
+      "loss": 0.1108,
+      "num_tokens": 171136048.0,
+      "reward": 0.3828125,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.0011446340940892696,
+      "sampling/sampling_logp_difference/max": 6.772670269012451,
+      "sampling/sampling_logp_difference/mean": 0.019680369645357132,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 5.620756382995751e-06,
+      "clip_ratio/high_mean": 1.4051890957489377e-06,
+      "clip_ratio/low_mean": 4.3911951024711016e-05,
+      "clip_ratio/low_min": 3.7100794543221127e-06,
+      "clip_ratio/region_mean": 4.531714012045995e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16298.0,
+      "completions/mean_length": 6418.3359375,
+      "completions/mean_terminated_length": 6339.8662109375,
+      "completions/min_length": 763.0,
+      "completions/min_terminated_length": 763.0,
+      "entropy": 0.8599612265825272,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018101281020790339,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 171976483.0,
+      "reward": 0.390625,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999486207962036,
+      "sampling/importance_sampling_ratio/min": 4.0352391806663945e-05,
+      "sampling/sampling_logp_difference/max": 10.117859840393066,
+      "sampling/sampling_logp_difference/mean": 0.01834172010421753,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 8.747987521928735e-06,
+      "clip_ratio/high_mean": 2.1869968804821838e-06,
+      "clip_ratio/low_mean": 1.736767285365204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9554669734134222e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15075.0,
+      "completions/mean_length": 5835.1484375,
+      "completions/mean_terminated_length": 5752.08642578125,
+      "completions/min_length": 561.0,
+      "completions/min_terminated_length": 561.0,
+      "entropy": 0.930196188390255,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0009842904983088374,
+      "learning_rate": 1e-05,
+      "loss": 0.0174,
+      "num_tokens": 172743158.0,
+      "reward": 0.3515625,
+      "reward_std": 0.12863078713417053,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000227689743042,
+      "sampling/importance_sampling_ratio/min": 0.02929825149476528,
+      "sampling/sampling_logp_difference/max": 3.5302274227142334,
+      "sampling/sampling_logp_difference/mean": 0.020194582641124725,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.4560856420284836e-05,
+      "clip_ratio/high_mean": 1.2245807511135354e-05,
+      "clip_ratio/low_mean": 4.938034498991328e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.162615136418026e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15807.0,
+      "completions/mean_length": 4960.5234375,
+      "completions/mean_terminated_length": 4870.57470703125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "entropy": 0.7726479545235634,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0032878813799470663,
+      "learning_rate": 1e-05,
+      "loss": -0.0492,
+      "num_tokens": 173400993.0,
+      "reward": 0.5,
+      "reward_std": 0.3924228549003601,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999999403953552,
+      "sampling/importance_sampling_ratio/min": 1.9806284399237484e-06,
+      "sampling/sampling_logp_difference/max": 13.132096290588379,
+      "sampling/sampling_logp_difference/mean": 0.018239401280879974,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 9.530344868835527e-06,
+      "clip_ratio/high_mean": 2.382586217208882e-06,
+      "clip_ratio/low_mean": 1.8789201192248584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1171787466300884e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15588.0,
+      "completions/max_terminated_length": 15588.0,
+      "completions/mean_length": 6778.453125,
+      "completions/mean_terminated_length": 6778.453125,
+      "completions/min_length": 709.0,
+      "completions/min_terminated_length": 709.0,
+      "entropy": 0.9891144260764122,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021506824996322393,
+      "learning_rate": 1e-05,
+      "loss": 0.0872,
+      "num_tokens": 174286163.0,
+      "reward": 0.3203125,
+      "reward_std": 0.23910348117351532,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00002121925354,
+      "sampling/importance_sampling_ratio/min": 3.8179036891961005e-06,
+      "sampling/sampling_logp_difference/max": 12.475809097290039,
+      "sampling/sampling_logp_difference/mean": 0.019467821344733238,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.731942322498071e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.731942322498071e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16363.0,
+      "completions/mean_length": 7835.8203125,
+      "completions/mean_terminated_length": 7768.51171875,
+      "completions/min_length": 282.0,
+      "completions/min_terminated_length": 282.0,
+      "entropy": 1.1394712179899216,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0019394620321691036,
+      "learning_rate": 1e-05,
+      "loss": 0.0144,
+      "num_tokens": 175314884.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999979138374329,
+      "sampling/importance_sampling_ratio/min": 0.0006493349210359156,
+      "sampling/sampling_logp_difference/max": 7.339561939239502,
+      "sampling/sampling_logp_difference/mean": 0.02314554899930954,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 2.6689051992434543e-05,
+      "clip_ratio/high_mean": 1.0311606502000359e-05,
+      "clip_ratio/low_mean": 4.749879690280068e-05,
+      "clip_ratio/low_min": 1.1613257356657414e-05,
+      "clip_ratio/region_mean": 5.781040522379044e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15977.0,
+      "completions/max_terminated_length": 15977.0,
+      "completions/mean_length": 6552.640625,
+      "completions/mean_terminated_length": 6552.640625,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "entropy": 0.9301942139863968,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029180990532040596,
+      "learning_rate": 1e-05,
+      "loss": 0.0895,
+      "num_tokens": 176170070.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3527093529701233,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000029802322388,
+      "sampling/importance_sampling_ratio/min": 0.004631850868463516,
+      "sampling/sampling_logp_difference/max": 5.374798774719238,
+      "sampling/sampling_logp_difference/mean": 0.01968369632959366,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 6.5973504206340294e-06,
+      "clip_ratio/high_mean": 1.6493376051585074e-06,
+      "clip_ratio/low_mean": 3.3509465310999076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.515880302984442e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 6035.296875,
+      "completions/mean_terminated_length": 5953.81103515625,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "entropy": 0.9439655765891075,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0013513187877833843,
+      "learning_rate": 1e-05,
+      "loss": 0.0062,
+      "num_tokens": 176962084.0,
+      "reward": 0.453125,
+      "reward_std": 0.23645779490470886,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000133514404297,
+      "sampling/importance_sampling_ratio/min": 7.028038817225024e-05,
+      "sampling/sampling_logp_difference/max": 9.563017845153809,
+      "sampling/sampling_logp_difference/mean": 0.020156048238277435,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 4.21926688431995e-06,
+      "clip_ratio/high_mean": 1.0548167210799875e-06,
+      "clip_ratio/low_mean": 3.7025285053005064e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8080101546711376e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15470.0,
+      "completions/mean_length": 7192.4296875,
+      "completions/mean_terminated_length": 6895.92724609375,
+      "completions/min_length": 703.0,
+      "completions/min_terminated_length": 703.0,
+      "entropy": 0.8545770645141602,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035121457185596228,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 177901579.0,
+      "reward": 0.328125,
+      "reward_std": 0.30221715569496155,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998663663864136,
+      "sampling/importance_sampling_ratio/min": 0.000296071550110355,
+      "sampling/sampling_logp_difference/max": 8.124909400939941,
+      "sampling/sampling_logp_difference/mean": 0.018486706539988518,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 3.974942046625074e-06,
+      "clip_ratio/high_mean": 9.937355116562685e-07,
+      "clip_ratio/low_mean": 3.2998319056787295e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.399205434106989e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 6525.328125,
+      "completions/mean_terminated_length": 6124.56884765625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.8625697493553162,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002456578193232417,
+      "learning_rate": 1e-05,
+      "loss": 0.0748,
+      "num_tokens": 178756773.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999394416809082,
+      "sampling/importance_sampling_ratio/min": 0.0001488614798290655,
+      "sampling/sampling_logp_difference/max": 8.812494277954102,
+      "sampling/sampling_logp_difference/mean": 0.018010437488555908,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 1.2826577403757256e-05,
+      "clip_ratio/high_mean": 4.401672981657612e-06,
+      "clip_ratio/low_mean": 7.05404337395521e-05,
+      "clip_ratio/low_min": 1.734040552037186e-05,
+      "clip_ratio/region_mean": 7.494210694858339e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14933.0,
+      "completions/mean_length": 7227.640625,
+      "completions/mean_terminated_length": 6932.27392578125,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.7740364670753479,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003040029900148511,
+      "learning_rate": 1e-05,
+      "loss": 0.1685,
+      "num_tokens": 179700639.0,
+      "reward": 0.515625,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9996599555015564,
+      "sampling/importance_sampling_ratio/min": 3.1452334496862022e-06,
+      "sampling/sampling_logp_difference/max": 12.669622421264648,
+      "sampling/sampling_logp_difference/mean": 0.018948577344417572,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 7.97244683781173e-06,
+      "clip_ratio/high_mean": 1.9931117094529327e-06,
+      "clip_ratio/low_mean": 2.7227763212067657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.922087492152059e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15639.0,
+      "completions/mean_length": 7019.4375,
+      "completions/mean_terminated_length": 6870.7939453125,
+      "completions/min_length": 427.0,
+      "completions/min_terminated_length": 427.0,
+      "entropy": 0.9501559659838676,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001853659632615745,
+      "learning_rate": 1e-05,
+      "loss": 0.0498,
+      "num_tokens": 180615847.0,
+      "reward": 0.390625,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999617338180542,
+      "sampling/importance_sampling_ratio/min": 0.0061973449774086475,
+      "sampling/sampling_logp_difference/max": 5.083634376525879,
+      "sampling/sampling_logp_difference/mean": 0.021023310720920563,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.039616189606022e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.039616189606022e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16384.0,
+      "completions/mean_length": 6705.03125,
+      "completions/mean_terminated_length": 6229.01611328125,
+      "completions/min_length": 1130.0,
+      "completions/min_terminated_length": 1130.0,
+      "entropy": 0.9054799973964691,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014863376272842288,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 181493971.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2396402806043625,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999639987945557,
+      "sampling/importance_sampling_ratio/min": 0.0023789836559444666,
+      "sampling/sampling_logp_difference/max": 6.04108190536499,
+      "sampling/sampling_logp_difference/mean": 0.019701875746250153,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.4479510582532384e-05,
+      "clip_ratio/high_mean": 3.619877645633096e-06,
+      "clip_ratio/low_mean": 2.6611398709519563e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0231276070935564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 5421.390625,
+      "completions/mean_terminated_length": 5421.390625,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.9483538940548897,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0039733098819851875,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 182208309.0,
+      "reward": 0.484375,
+      "reward_std": 0.309583842754364,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999675154685974,
+      "sampling/importance_sampling_ratio/min": 0.011960627511143684,
+      "sampling/sampling_logp_difference/max": 5.5837554931640625,
+      "sampling/sampling_logp_difference/mean": 0.01952577941119671,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.601678483595606e-06,
+      "clip_ratio/high_mean": 1.1504196208989015e-06,
+      "clip_ratio/low_mean": 4.089345225111174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2043871189889614e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 6497.28125,
+      "completions/mean_terminated_length": 6340.349609375,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "entropy": 0.8902791813015938,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015076796989887953,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 183058249.0,
+      "reward": 0.4453125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000579357147217,
+      "sampling/importance_sampling_ratio/min": 0.011128061451017857,
+      "sampling/sampling_logp_difference/max": 4.498285293579102,
+      "sampling/sampling_logp_difference/mean": 0.019255032762885094,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 5.255413270788267e-06,
+      "clip_ratio/high_mean": 1.3138533176970668e-06,
+      "clip_ratio/low_mean": 3.985653711424675e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1170390431943815e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14710.0,
+      "completions/max_terminated_length": 14710.0,
+      "completions/mean_length": 4411.4453125,
+      "completions/mean_terminated_length": 4411.4453125,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "entropy": 1.104304239153862,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002237006789073348,
+      "learning_rate": 1e-05,
+      "loss": 0.1124,
+      "num_tokens": 183645026.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22461041808128357,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000056028366089,
+      "sampling/importance_sampling_ratio/min": 4.804155082638317e-07,
+      "sampling/sampling_logp_difference/max": 14.548614501953125,
+      "sampling/sampling_logp_difference/mean": 0.020417846739292145,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 4.956973498337902e-06,
+      "clip_ratio/high_mean": 1.2392433745844755e-06,
+      "clip_ratio/low_mean": 4.839278165036376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9632024911261396e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15486.0,
+      "completions/mean_length": 5763.3828125,
+      "completions/mean_terminated_length": 5508.48828125,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 0.7673545032739639,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0027243588119745255,
+      "learning_rate": 1e-05,
+      "loss": 0.0747,
+      "num_tokens": 184402387.0,
+      "reward": 0.4375,
+      "reward_std": 0.3661494255065918,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999270439147949,
+      "sampling/importance_sampling_ratio/min": 0.0008851620368659496,
+      "sampling/sampling_logp_difference/max": 7.029739856719971,
+      "sampling/sampling_logp_difference/mean": 0.01735807955265045,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 1.412869187333854e-05,
+      "clip_ratio/high_mean": 3.532172968334635e-06,
+      "clip_ratio/low_mean": 4.364474455087475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.717691729183571e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15837.0,
+      "completions/mean_length": 6143.3125,
+      "completions/mean_terminated_length": 5980.76220703125,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "entropy": 0.9383679181337357,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016755202086642385,
+      "learning_rate": 1e-05,
+      "loss": 0.1134,
+      "num_tokens": 185207315.0,
+      "reward": 0.40625,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999443292617798,
+      "sampling/importance_sampling_ratio/min": 0.00010746628686320037,
+      "sampling/sampling_logp_difference/max": 9.138333320617676,
+      "sampling/sampling_logp_difference/mean": 0.01892942003905773,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 5.389092621044256e-06,
+      "clip_ratio/high_mean": 1.347273155261064e-06,
+      "clip_ratio/low_mean": 4.616663244405572e-05,
+      "clip_ratio/low_min": 5.818554200232029e-06,
+      "clip_ratio/region_mean": 4.7513905599316786e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16101.0,
+      "completions/mean_length": 6852.234375,
+      "completions/mean_terminated_length": 6623.47216796875,
+      "completions/min_length": 884.0,
+      "completions/min_terminated_length": 884.0,
+      "entropy": 0.9856249913573265,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0036351638846099377,
+      "learning_rate": 1e-05,
+      "loss": 0.0413,
+      "num_tokens": 186104113.0,
+      "reward": 0.375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 0.0006267272983677685,
+      "sampling/sampling_logp_difference/max": 7.374999046325684,
+      "sampling/sampling_logp_difference/mean": 0.021776381880044937,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.837307613390294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.837307613390294e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16206.0,
+      "completions/mean_length": 6634.1484375,
+      "completions/mean_terminated_length": 6479.38916015625,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "entropy": 1.0182439163327217,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003553485032171011,
+      "learning_rate": 1e-05,
+      "loss": 0.0886,
+      "num_tokens": 186973796.0,
+      "reward": 0.34375,
+      "reward_std": 0.24381662905216217,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999936819076538,
+      "sampling/importance_sampling_ratio/min": 0.00038018118357285857,
+      "sampling/sampling_logp_difference/max": 7.8748626708984375,
+      "sampling/sampling_logp_difference/mean": 0.02058180794119835,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 1.4436222500080476e-05,
+      "clip_ratio/high_mean": 3.609055625020119e-06,
+      "clip_ratio/low_mean": 5.134189859745675e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.495095410879003e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14911.0,
+      "completions/mean_length": 6424.2421875,
+      "completions/mean_terminated_length": 6266.1513671875,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "entropy": 0.9030232205986977,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002669632900506258,
+      "learning_rate": 1e-05,
+      "loss": 0.0828,
+      "num_tokens": 187820443.0,
+      "reward": 0.34375,
+      "reward_std": 0.2817176878452301,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999942183494568,
+      "sampling/importance_sampling_ratio/min": 0.004488746635615826,
+      "sampling/sampling_logp_difference/max": 5.406181812286377,
+      "sampling/sampling_logp_difference/mean": 0.01908625289797783,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 1.4932538306311471e-05,
+      "clip_ratio/high_mean": 3.733134576577868e-06,
+      "clip_ratio/low_mean": 2.516909023597691e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8902224585181102e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14473.0,
+      "completions/mean_length": 6582.21875,
+      "completions/mean_terminated_length": 6505.03955078125,
+      "completions/min_length": 20.0,
+      "completions/min_terminated_length": 20.0,
+      "entropy": 0.9906348586082458,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021964670158922672,
+      "learning_rate": 1e-05,
+      "loss": 0.0122,
+      "num_tokens": 188682111.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22908620536327362,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.01623692736029625,
+      "sampling/sampling_logp_difference/max": 4.9629387855529785,
+      "sampling/sampling_logp_difference/mean": 0.020555656403303146,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 1.3005691471335012e-05,
+      "clip_ratio/high_mean": 3.251422867833753e-06,
+      "clip_ratio/low_mean": 4.822792686809407e-05,
+      "clip_ratio/low_min": 4.575235379888909e-06,
+      "clip_ratio/region_mean": 5.147934950855415e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16333.0,
+      "completions/mean_length": 6687.8359375,
+      "completions/mean_terminated_length": 6611.48828125,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "entropy": 0.9669140502810478,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0032587468158453703,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "num_tokens": 189556570.0,
+      "reward": 0.375,
+      "reward_std": 0.36956924200057983,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.002121176104992628,
+      "sampling/sampling_logp_difference/max": 6.155784606933594,
+      "sampling/sampling_logp_difference/mean": 0.020776130259037018,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 2.541685034884722e-05,
+      "clip_ratio/high_mean": 6.354212587211805e-06,
+      "clip_ratio/low_mean": 4.488310526085115e-05,
+      "clip_ratio/low_min": 4.259959951014025e-06,
+      "clip_ratio/region_mean": 5.123731762068928e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14579.0,
+      "completions/mean_length": 5933.890625,
+      "completions/mean_terminated_length": 5851.6064453125,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 0.777520164847374,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023373132571578026,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 190333676.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3577219247817993,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999416470527649,
+      "sampling/importance_sampling_ratio/min": 1.3007656889385544e-05,
+      "sampling/sampling_logp_difference/max": 11.249972343444824,
+      "sampling/sampling_logp_difference/mean": 0.017036860808730125,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 9.352454981126357e-06,
+      "clip_ratio/high_mean": 2.3381137452815892e-06,
+      "clip_ratio/low_mean": 3.286883497821691e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5206948496124824e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16254.0,
+      "completions/mean_length": 6691.53125,
+      "completions/mean_terminated_length": 6537.68310546875,
+      "completions/min_length": 797.0,
+      "completions/min_terminated_length": 797.0,
+      "entropy": 1.0021202191710472,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033220481127500534,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 191208240.0,
+      "reward": 0.2265625,
+      "reward_std": 0.23987272381782532,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999876618385315,
+      "sampling/importance_sampling_ratio/min": 0.006665683817118406,
+      "sampling/sampling_logp_difference/max": 5.010782718658447,
+      "sampling/sampling_logp_difference/mean": 0.02151130512356758,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 2.0475443307077512e-05,
+      "clip_ratio/high_mean": 5.118860826769378e-06,
+      "clip_ratio/low_mean": 4.199072691335459e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7109587512750295e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15653.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 5480.5078125,
+      "completions/mean_terminated_length": 5480.5078125,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "entropy": 0.774504691362381,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002824194496497512,
+      "learning_rate": 1e-05,
+      "loss": 0.0472,
+      "num_tokens": 191927753.0,
+      "reward": 0.5078125,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999160766601562,
+      "sampling/importance_sampling_ratio/min": 2.561557721492136e-06,
+      "sampling/sampling_logp_difference/max": 12.874895095825195,
+      "sampling/sampling_logp_difference/mean": 0.01758616417646408,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.71521939541708e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.71521939541708e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16232.0,
+      "completions/mean_length": 6245.171875,
+      "completions/mean_terminated_length": 6001.84033203125,
+      "completions/min_length": 620.0,
+      "completions/min_terminated_length": 620.0,
+      "entropy": 0.9671605005860329,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020431289449334145,
+      "learning_rate": 1e-05,
+      "loss": 0.0527,
+      "num_tokens": 192746327.0,
+      "reward": 0.3359375,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999756813049316,
+      "sampling/importance_sampling_ratio/min": 7.518127677030861e-05,
+      "sampling/sampling_logp_difference/max": 9.49560832977295,
+      "sampling/sampling_logp_difference/mean": 0.02066320925951004,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 1.1142639777972363e-05,
+      "clip_ratio/high_mean": 2.7856599444930907e-06,
+      "clip_ratio/low_mean": 4.276063509678352e-05,
+      "clip_ratio/low_min": 3.055412889807485e-06,
+      "clip_ratio/region_mean": 4.554629526865028e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16371.0,
+      "completions/max_terminated_length": 15709.0,
+      "completions/mean_length": 6828.8515625,
+      "completions/mean_terminated_length": 6677.38916015625,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 0.9914879351854324,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0019144542748108506,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193643468.0,
+      "reward": 0.34375,
+      "reward_std": 0.3264309763908386,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000360012054443,
+      "sampling/importance_sampling_ratio/min": 0.0003172139695379883,
+      "sampling/sampling_logp_difference/max": 8.055933952331543,
+      "sampling/sampling_logp_difference/mean": 0.020327996462583542,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 1.3134391338098794e-05,
+      "clip_ratio/high_mean": 3.2835978345246986e-06,
+      "clip_ratio/low_mean": 5.683154779489996e-05,
+      "clip_ratio/low_min": 4.3356108108127955e-06,
+      "clip_ratio/region_mean": 6.011514608417201e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16289.0,
+      "completions/mean_length": 6280.125,
+      "completions/mean_terminated_length": 5954.193359375,
+      "completions/min_length": 91.0,
+      "completions/min_terminated_length": 91.0,
+      "entropy": 0.8634965419769287,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022551591973751783,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 194465324.0,
+      "reward": 0.46875,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999502897262573,
+      "sampling/importance_sampling_ratio/min": 0.003390352241694927,
+      "sampling/sampling_logp_difference/max": 5.686821460723877,
+      "sampling/sampling_logp_difference/mean": 0.019659511744976044,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.619306153268553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619306153268553e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15646.0,
+      "completions/mean_length": 6910.5625,
+      "completions/mean_terminated_length": 6525.46337890625,
+      "completions/min_length": 1225.0,
+      "completions/min_terminated_length": 1225.0,
+      "entropy": 0.9886282533407211,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0012924466282129288,
+      "learning_rate": 1e-05,
+      "loss": 0.0753,
+      "num_tokens": 195369580.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2590838074684143,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000083327293396,
+      "sampling/importance_sampling_ratio/min": 1.0787954124680255e-05,
+      "sampling/sampling_logp_difference/max": 11.437080383300781,
+      "sampling/sampling_logp_difference/mean": 0.020975295454263687,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.377244143441203e-05,
+      "clip_ratio/high_mean": 3.4431103586030076e-06,
+      "clip_ratio/low_mean": 2.4107489650759817e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7550600123049662e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12768.0,
+      "completions/mean_length": 5647.53125,
+      "completions/mean_terminated_length": 5562.9921875,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 0.8360519111156464,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019667574670165777,
+      "learning_rate": 1e-05,
+      "loss": 0.0333,
+      "num_tokens": 196110328.0,
+      "reward": 0.4921875,
+      "reward_std": 0.33508312702178955,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999792575836182,
+      "sampling/importance_sampling_ratio/min": 0.00731487525627017,
+      "sampling/sampling_logp_difference/max": 4.917845249176025,
+      "sampling/sampling_logp_difference/mean": 0.017768483608961105,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.114784731726104e-05,
+      "clip_ratio/high_mean": 2.78696182931526e-06,
+      "clip_ratio/low_mean": 2.6054579166157055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8841540995472315e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15671.0,
+      "completions/mean_length": 6249.6171875,
+      "completions/mean_terminated_length": 6088.75439453125,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "entropy": 0.837661437690258,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017836211482062936,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 196926255.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2585548758506775,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999443888664246,
+      "sampling/importance_sampling_ratio/min": 8.313281432492658e-05,
+      "sampling/sampling_logp_difference/max": 9.395071029663086,
+      "sampling/sampling_logp_difference/mean": 0.018142729997634888,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 3.1028919238451635e-06,
+      "clip_ratio/high_mean": 7.757229809612909e-07,
+      "clip_ratio/low_mean": 5.6368714012933196e-05,
+      "clip_ratio/low_min": 5.583348411164479e-06,
+      "clip_ratio/region_mean": 5.7144436595990555e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14663.0,
+      "completions/mean_length": 5561.796875,
+      "completions/mean_terminated_length": 5476.58251953125,
+      "completions/min_length": 325.0,
+      "completions/min_terminated_length": 325.0,
+      "entropy": 1.0337117239832878,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0032067650463432074,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 197657021.0,
+      "reward": 0.421875,
+      "reward_std": 0.3603675961494446,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000261068344116,
+      "sampling/importance_sampling_ratio/min": 0.0026236141566187143,
+      "sampling/sampling_logp_difference/max": 5.943202495574951,
+      "sampling/sampling_logp_difference/mean": 0.02046290785074234,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 2.244927713945799e-05,
+      "clip_ratio/high_mean": 5.612319284864498e-06,
+      "clip_ratio/low_mean": 3.963059293710103e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5242911710374756e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14806.0,
+      "completions/mean_length": 7230.09375,
+      "completions/mean_terminated_length": 7010.400390625,
+      "completions/min_length": 858.0,
+      "completions/min_terminated_length": 858.0,
+      "entropy": 0.9666887000203133,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002695069881156087,
+      "learning_rate": 1e-05,
+      "loss": 0.0321,
+      "num_tokens": 198604673.0,
+      "reward": 0.390625,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999954104423523,
+      "sampling/importance_sampling_ratio/min": 0.004087009001523256,
+      "sampling/sampling_logp_difference/max": 5.499941825866699,
+      "sampling/sampling_logp_difference/mean": 0.021222755312919617,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 6.0509246395668015e-06,
+      "clip_ratio/high_mean": 3.018646339114639e-06,
+      "clip_ratio/low_mean": 4.125545319766388e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4274099309404846e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14964.0,
+      "completions/mean_length": 7186.09375,
+      "completions/mean_terminated_length": 7040.095703125,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "entropy": 0.9754119142889977,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014327351236715913,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 199545181.0,
+      "reward": 0.328125,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999538660049438,
+      "sampling/importance_sampling_ratio/min": 3.340628245496191e-05,
+      "sampling/sampling_logp_difference/max": 10.306766510009766,
+      "sampling/sampling_logp_difference/mean": 0.02061491459608078,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.3521318351195077e-05,
+      "clip_ratio/high_mean": 3.3803295877987694e-06,
+      "clip_ratio/low_mean": 4.744600971662294e-05,
+      "clip_ratio/low_min": 4.111165708309272e-06,
+      "clip_ratio/region_mean": 5.08263395886388e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15812.0,
+      "completions/mean_length": 7464.1328125,
+      "completions/mean_terminated_length": 7322.5478515625,
+      "completions/min_length": 994.0,
+      "completions/min_terminated_length": 994.0,
+      "entropy": 1.0257701128721237,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017415130278095603,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 200521262.0,
+      "reward": 0.296875,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000050067901611,
+      "sampling/importance_sampling_ratio/min": 0.004382971208542585,
+      "sampling/sampling_logp_difference/max": 5.430028438568115,
+      "sampling/sampling_logp_difference/mean": 0.02146603912115097,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6656134000168095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6656134000168095e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15796.0,
+      "completions/mean_length": 7929.0390625,
+      "completions/mean_terminated_length": 6973.2607421875,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.8728866130113602,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018543615005910397,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "num_tokens": 201553491.0,
+      "reward": 0.25,
+      "reward_std": 0.3237725794315338,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999157786369324,
+      "sampling/importance_sampling_ratio/min": 0.0002044498542090878,
+      "sampling/sampling_logp_difference/max": 8.495187759399414,
+      "sampling/sampling_logp_difference/mean": 0.01925993338227272,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.5812252968316898e-05,
+      "clip_ratio/high_mean": 3.9530632420792244e-06,
+      "clip_ratio/low_mean": 4.320342043229175e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.715648356068414e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15938.0,
+      "completions/mean_length": 6577.84375,
+      "completions/mean_terminated_length": 6261.51611328125,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "entropy": 0.759723886847496,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001268691150471568,
+      "learning_rate": 1e-05,
+      "loss": 0.117,
+      "num_tokens": 202411655.0,
+      "reward": 0.515625,
+      "reward_std": 0.34822866320610046,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999426603317261,
+      "sampling/importance_sampling_ratio/min": 0.0004213420324958861,
+      "sampling/sampling_logp_difference/max": 7.77206563949585,
+      "sampling/sampling_logp_difference/mean": 0.018232906237244606,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.175654944698181e-05,
+      "clip_ratio/low_min": 8.377270660275826e-06,
+      "clip_ratio/region_mean": 3.175654944698181e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16117.0,
+      "completions/max_terminated_length": 16117.0,
+      "completions/mean_length": 6513.65625,
+      "completions/mean_terminated_length": 6513.65625,
+      "completions/min_length": 858.0,
+      "completions/min_terminated_length": 858.0,
+      "entropy": 1.0247815549373627,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004479583352804184,
+      "learning_rate": 1e-05,
+      "loss": -0.0114,
+      "num_tokens": 203265811.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999909400939941,
+      "sampling/importance_sampling_ratio/min": 0.011329792439937592,
+      "sampling/sampling_logp_difference/max": 4.480319499969482,
+      "sampling/sampling_logp_difference/mean": 0.02229863964021206,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 5.371261522668647e-06,
+      "clip_ratio/high_mean": 1.3428153806671617e-06,
+      "clip_ratio/low_mean": 4.290480364943505e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4247618916415377e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16035.0,
+      "completions/max_terminated_length": 16035.0,
+      "completions/mean_length": 6013.6171875,
+      "completions/mean_terminated_length": 6013.6171875,
+      "completions/min_length": 535.0,
+      "completions/min_terminated_length": 535.0,
+      "entropy": 0.8476304411888123,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017210334772244096,
+      "learning_rate": 1e-05,
+      "loss": 0.0986,
+      "num_tokens": 204054186.0,
+      "reward": 0.5078125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998961687088013,
+      "sampling/importance_sampling_ratio/min": 3.32363242705469e-06,
+      "sampling/sampling_logp_difference/max": 12.614452362060547,
+      "sampling/sampling_logp_difference/mean": 0.018720701336860657,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.4894108517182758e-05,
+      "clip_ratio/high_mean": 3.7235271292956895e-06,
+      "clip_ratio/low_mean": 3.136672694381559e-05,
+      "clip_ratio/low_min": 3.941974227927858e-06,
+      "clip_ratio/region_mean": 3.509025418679812e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14989.0,
+      "completions/max_terminated_length": 14989.0,
+      "completions/mean_length": 7090.2109375,
+      "completions/mean_terminated_length": 7090.2109375,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.9804464280605316,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003268485888838768,
+      "learning_rate": 1e-05,
+      "loss": 0.0441,
+      "num_tokens": 204982085.0,
+      "reward": 0.3828125,
+      "reward_std": 0.23751860857009888,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999740719795227,
+      "sampling/importance_sampling_ratio/min": 6.605670205317438e-05,
+      "sampling/sampling_logp_difference/max": 9.62499713897705,
+      "sampling/sampling_logp_difference/mean": 0.021524619311094284,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.3869113445252879e-05,
+      "clip_ratio/high_mean": 3.4672783613132196e-06,
+      "clip_ratio/low_mean": 3.1164222662027896e-05,
+      "clip_ratio/low_min": 2.928154799519689e-06,
+      "clip_ratio/region_mean": 3.46315009664977e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 6272.65625,
+      "completions/mean_terminated_length": 6112.1591796875,
+      "completions/min_length": 65.0,
+      "completions/min_terminated_length": 65.0,
+      "entropy": 0.8322838544845581,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002127156127244234,
+      "learning_rate": 1e-05,
+      "loss": 0.0142,
+      "num_tokens": 205805529.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3385029733181,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999648928642273,
+      "sampling/importance_sampling_ratio/min": 0.00019322636944707483,
+      "sampling/sampling_logp_difference/max": 8.551648139953613,
+      "sampling/sampling_logp_difference/mean": 0.018514126539230347,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 7.213966455310583e-06,
+      "clip_ratio/high_mean": 4.349803020886611e-06,
+      "clip_ratio/low_mean": 3.907777556833025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3427579043964215e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6422.7109375,
+      "completions/mean_terminated_length": 5846.43798828125,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "entropy": 0.8222996592521667,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001939435489475727,
+      "learning_rate": 1e-05,
+      "loss": 0.1001,
+      "num_tokens": 206647908.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26143795251846313,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 6.205694808159024e-05,
+      "sampling/sampling_logp_difference/max": 9.687458038330078,
+      "sampling/sampling_logp_difference/mean": 0.018810249865055084,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 2.1247945142022218e-05,
+      "clip_ratio/high_mean": 6.189401005940454e-06,
+      "clip_ratio/low_mean": 4.7238423121598316e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.342782378647826e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15664.0,
+      "completions/mean_length": 6179.8046875,
+      "completions/mean_terminated_length": 6099.45654296875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.031787522137165,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002615252509713173,
+      "learning_rate": 1e-05,
+      "loss": 0.0147,
+      "num_tokens": 207459043.0,
+      "reward": 0.5,
+      "reward_std": 0.3232533931732178,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000026226043701,
+      "sampling/importance_sampling_ratio/min": 1.9359204088686965e-05,
+      "sampling/sampling_logp_difference/max": 10.85234260559082,
+      "sampling/sampling_logp_difference/mean": 0.020463883876800537,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.5109407349409594e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5109407349409594e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16052.0,
+      "completions/mean_length": 7093.5390625,
+      "completions/mean_terminated_length": 6474.17529296875,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.8378612920641899,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002656357828527689,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208389800.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998178482055664,
+      "sampling/importance_sampling_ratio/min": 2.1559546439675614e-05,
+      "sampling/sampling_logp_difference/max": 10.744691848754883,
+      "sampling/sampling_logp_difference/mean": 0.01860899105668068,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7354818396597693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7354818396597693e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 7782.46875,
+      "completions/mean_terminated_length": 7576.34423828125,
+      "completions/min_length": 85.0,
+      "completions/min_terminated_length": 85.0,
+      "entropy": 1.0068294331431389,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026847824919968843,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 209407212.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2188364565372467,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 2.5824127078521997e-05,
+      "sampling/sampling_logp_difference/max": 10.564201354980469,
+      "sampling/sampling_logp_difference/mean": 0.021435359492897987,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 1.5335908301494783e-05,
+      "clip_ratio/high_mean": 3.833977075373696e-06,
+      "clip_ratio/low_mean": 3.303791140751855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6871888482892246e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 6713.3359375,
+      "completions/mean_terminated_length": 6637.18896484375,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "entropy": 0.8899351507425308,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019718443509191275,
+      "learning_rate": 1e-05,
+      "loss": 0.0167,
+      "num_tokens": 210286983.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29719969630241394,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000264644622803,
+      "sampling/importance_sampling_ratio/min": 8.772138971835375e-05,
+      "sampling/sampling_logp_difference/max": 9.341344833374023,
+      "sampling/sampling_logp_difference/mean": 0.019354315474629402,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 2.0819897144974675e-05,
+      "clip_ratio/high_mean": 5.204974286243669e-06,
+      "clip_ratio/low_mean": 3.656347121250292e-05,
+      "clip_ratio/low_min": 5.0166554501629435e-06,
+      "clip_ratio/region_mean": 4.176844549874659e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14552.0,
+      "completions/mean_length": 6275.5390625,
+      "completions/mean_terminated_length": 6115.087890625,
+      "completions/min_length": 663.0,
+      "completions/min_terminated_length": 663.0,
+      "entropy": 0.901648998260498,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.0029727297369390726,
+      "learning_rate": 1e-05,
+      "loss": 0.0593,
+      "num_tokens": 211107380.0,
+      "reward": 0.40625,
+      "reward_std": 0.4373784065246582,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999792575836182,
+      "sampling/importance_sampling_ratio/min": 0.00043164435192011297,
+      "sampling/sampling_logp_difference/max": 7.747908592224121,
+      "sampling/sampling_logp_difference/mean": 0.019338306039571762,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 4.363734251455753e-05,
+      "clip_ratio/high_mean": 1.2403264463500818e-05,
+      "clip_ratio/low_mean": 4.217202859990721e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4575292381287e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 5959.7578125,
+      "completions/mean_terminated_length": 5877.67724609375,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "entropy": 0.8542912155389786,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028311724308878183,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 211890237.0,
+      "reward": 0.515625,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 0.0007836154545657337,
+      "sampling/sampling_logp_difference/max": 7.151592254638672,
+      "sampling/sampling_logp_difference/mean": 0.018685901537537575,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 1.514913219580194e-05,
+      "clip_ratio/high_mean": 3.787283048950485e-06,
+      "clip_ratio/low_mean": 3.2207174626819324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5994458357890835e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16008.0,
+      "completions/mean_length": 6882.1875,
+      "completions/mean_terminated_length": 6575.67724609375,
+      "completions/min_length": 1170.0,
+      "completions/min_terminated_length": 1170.0,
+      "entropy": 0.9642625227570534,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002143653342500329,
+      "learning_rate": 1e-05,
+      "loss": 0.0127,
+      "num_tokens": 212792813.0,
+      "reward": 0.359375,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999080896377563,
+      "sampling/importance_sampling_ratio/min": 0.0034667642321437597,
+      "sampling/sampling_logp_difference/max": 5.664533615112305,
+      "sampling/sampling_logp_difference/mean": 0.020183943212032318,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.7900180637298035e-05,
+      "clip_ratio/high_mean": 4.475045159324509e-06,
+      "clip_ratio/low_mean": 3.741970294868224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1894748392223846e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 6941.8828125,
+      "completions/mean_terminated_length": 6715.2724609375,
+      "completions/min_length": 978.0,
+      "completions/min_terminated_length": 978.0,
+      "entropy": 0.9488044381141663,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014945612056180835,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 213703638.0,
+      "reward": 0.3984375,
+      "reward_std": 0.24329257011413574,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999405145645142,
+      "sampling/importance_sampling_ratio/min": 0.0005360813229344785,
+      "sampling/sampling_logp_difference/max": 7.531224727630615,
+      "sampling/sampling_logp_difference/mean": 0.02019106224179268,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 4.028359853691654e-06,
+      "clip_ratio/high_mean": 1.0070899634229136e-06,
+      "clip_ratio/low_mean": 4.494676113608875e-05,
+      "clip_ratio/low_min": 3.771535375562962e-06,
+      "clip_ratio/region_mean": 4.595385098582483e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14403.0,
+      "completions/mean_length": 6453.2109375,
+      "completions/mean_terminated_length": 6295.57958984375,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "entropy": 0.9140987247228622,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001788914087228477,
+      "learning_rate": 1e-05,
+      "loss": 0.0573,
+      "num_tokens": 214551065.0,
+      "reward": 0.3984375,
+      "reward_std": 0.34245961904525757,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999093413352966,
+      "sampling/importance_sampling_ratio/min": 6.614608719246462e-05,
+      "sampling/sampling_logp_difference/max": 9.623644828796387,
+      "sampling/sampling_logp_difference/mean": 0.01938386633992195,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.3890341051592259e-05,
+      "clip_ratio/high_mean": 3.4725852628980647e-06,
+      "clip_ratio/low_mean": 2.701378042502256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0486365801607462e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16074.0,
+      "completions/mean_length": 7625.375,
+      "completions/mean_terminated_length": 7556.4091796875,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.9313022494316101,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023314026184380054,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 215546625.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999741315841675,
+      "sampling/importance_sampling_ratio/min": 3.250058568937675e-07,
+      "sampling/sampling_logp_difference/max": 14.939422607421875,
+      "sampling/sampling_logp_difference/mean": 0.020401259884238243,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 2.9235679903649725e-05,
+      "clip_ratio/high_mean": 7.308919975912431e-06,
+      "clip_ratio/low_mean": 2.5110286742346943e-05,
+      "clip_ratio/low_min": 3.1065162602317287e-06,
+      "clip_ratio/region_mean": 3.24192064908857e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16084.0,
+      "completions/mean_length": 6315.3046875,
+      "completions/mean_terminated_length": 6155.484375,
+      "completions/min_length": 920.0,
+      "completions/min_terminated_length": 920.0,
+      "entropy": 0.8942855522036552,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003379981964826584,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 216377176.0,
+      "reward": 0.421875,
+      "reward_std": 0.31587696075439453,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999739527702332,
+      "sampling/importance_sampling_ratio/min": 0.008766444399952888,
+      "sampling/sampling_logp_difference/max": 4.736824035644531,
+      "sampling/sampling_logp_difference/mean": 0.01958339475095272,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 1.070113876266987e-05,
+      "clip_ratio/high_mean": 2.6752846906674677e-06,
+      "clip_ratio/low_mean": 3.970586050172642e-05,
+      "clip_ratio/low_min": 5.915619567531394e-06,
+      "clip_ratio/region_mean": 4.238114468080312e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15699.0,
+      "completions/mean_length": 7196.7109375,
+      "completions/mean_terminated_length": 6823.24365234375,
+      "completions/min_length": 741.0,
+      "completions/min_terminated_length": 741.0,
+      "entropy": 1.0663049817085266,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025235258508473635,
+      "learning_rate": 1e-05,
+      "loss": 0.0662,
+      "num_tokens": 217316755.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2893138825893402,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999923586845398,
+      "sampling/importance_sampling_ratio/min": 0.0007813565316610038,
+      "sampling/sampling_logp_difference/max": 7.154479026794434,
+      "sampling/sampling_logp_difference/mean": 0.02093672752380371,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 3.7446132409968413e-05,
+      "clip_ratio/high_mean": 1.0083826055051759e-05,
+      "clip_ratio/low_mean": 5.169025735085597e-05,
+      "clip_ratio/low_min": 5.641812549583847e-06,
+      "clip_ratio/region_mean": 6.177408295116038e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16286.0,
+      "completions/max_terminated_length": 16286.0,
+      "completions/mean_length": 6770.59375,
+      "completions/mean_terminated_length": 6770.59375,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 1.0205552130937576,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038966729771345854,
+      "learning_rate": 1e-05,
+      "loss": 0.0849,
+      "num_tokens": 218203975.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9994924068450928,
+      "sampling/importance_sampling_ratio/min": 2.5875104370243207e-07,
+      "sampling/sampling_logp_difference/max": 15.167399406433105,
+      "sampling/sampling_logp_difference/mean": 0.025428105145692825,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 3.3825838272605324e-06,
+      "clip_ratio/high_mean": 8.456459568151331e-07,
+      "clip_ratio/low_mean": 2.8302461942075752e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9148108296794817e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15749.0,
+      "completions/mean_length": 7115.6953125,
+      "completions/mean_terminated_length": 6968.57958984375,
+      "completions/min_length": 540.0,
+      "completions/min_terminated_length": 540.0,
+      "entropy": 1.0728939920663834,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025828159414231777,
+      "learning_rate": 1e-05,
+      "loss": 0.0422,
+      "num_tokens": 219134568.0,
+      "reward": 0.2890625,
+      "reward_std": 0.21990221738815308,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999753832817078,
+      "sampling/importance_sampling_ratio/min": 0.0019932277500629425,
+      "sampling/sampling_logp_difference/max": 6.2179999351501465,
+      "sampling/sampling_logp_difference/mean": 0.02109808847308159,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 8.590399147578864e-06,
+      "clip_ratio/high_mean": 2.147599786894716e-06,
+      "clip_ratio/low_mean": 4.2856369077526324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5003969148638134e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15560.0,
+      "completions/mean_length": 6160.125,
+      "completions/mean_terminated_length": 5914.75244140625,
+      "completions/min_length": 81.0,
+      "completions/min_terminated_length": 81.0,
+      "entropy": 0.8673425689339638,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002692030044272542,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 219943376.0,
+      "reward": 0.4375,
+      "reward_std": 0.34717273712158203,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998801350593567,
+      "sampling/importance_sampling_ratio/min": 0.0021331151947379112,
+      "sampling/sampling_logp_difference/max": 6.150171756744385,
+      "sampling/sampling_logp_difference/mean": 0.01947931945323944,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.4606903429667e-05,
+      "clip_ratio/low_min": 4.498344424064271e-06,
+      "clip_ratio/region_mean": 4.4606903429667e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14763.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5778.0234375,
+      "completions/mean_terminated_length": 5778.0234375,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 1.1366781443357468,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002457446651533246,
+      "learning_rate": 1e-05,
+      "loss": 0.0399,
+      "num_tokens": 220702603.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3400956988334656,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9996986985206604,
+      "sampling/importance_sampling_ratio/min": 1.4515491386646318e-07,
+      "sampling/sampling_logp_difference/max": 15.745464324951172,
+      "sampling/sampling_logp_difference/mean": 0.021183129400014877,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 6.248437784961425e-06,
+      "clip_ratio/high_mean": 2.4186024347727653e-06,
+      "clip_ratio/low_mean": 1.783873301519634e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.025733522259543e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 7509.078125,
+      "completions/mean_terminated_length": 7296.08056640625,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "entropy": 1.071702554821968,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002503670286387205,
+      "learning_rate": 1e-05,
+      "loss": -0.0088,
+      "num_tokens": 221683925.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999822378158569,
+      "sampling/importance_sampling_ratio/min": 0.00013993355969432741,
+      "sampling/sampling_logp_difference/max": 8.874342918395996,
+      "sampling/sampling_logp_difference/mean": 0.021589912474155426,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 2.347871304664295e-05,
+      "clip_ratio/high_mean": 6.97559880791232e-06,
+      "clip_ratio/low_mean": 2.81686479866039e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.514424770401092e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15153.0,
+      "completions/mean_length": 7383.03125,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "entropy": 0.8432145267724991,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002299589104950428,
+      "learning_rate": 1e-05,
+      "loss": 0.0212,
+      "num_tokens": 222648865.0,
+      "reward": 0.3125,
+      "reward_std": 0.2845909595489502,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999674558639526,
+      "sampling/importance_sampling_ratio/min": 2.8099755581934005e-05,
+      "sampling/sampling_logp_difference/max": 10.47974967956543,
+      "sampling/sampling_logp_difference/mean": 0.018576428294181824,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 9.285309715778567e-06,
+      "clip_ratio/high_mean": 3.327153194732091e-06,
+      "clip_ratio/low_mean": 3.823394035862293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.156109298492083e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 6628.921875,
+      "completions/mean_terminated_length": 6552.1103515625,
+      "completions/min_length": 903.0,
+      "completions/min_terminated_length": 903.0,
+      "entropy": 0.9039670825004578,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024530349764972925,
+      "learning_rate": 1e-05,
+      "loss": 0.1161,
+      "num_tokens": 223519175.0,
+      "reward": 0.59375,
+      "reward_std": 0.3537701964378357,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999820590019226,
+      "sampling/importance_sampling_ratio/min": 0.0003009368374478072,
+      "sampling/sampling_logp_difference/max": 8.108610153198242,
+      "sampling/sampling_logp_difference/mean": 0.01871109940111637,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 1.5403714087369735e-05,
+      "clip_ratio/high_mean": 3.850928521842434e-06,
+      "clip_ratio/low_mean": 3.431152225630285e-05,
+      "clip_ratio/low_min": 4.570718374452554e-06,
+      "clip_ratio/region_mean": 3.816245106236238e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16081.0,
+      "completions/mean_length": 7335.3359375,
+      "completions/mean_terminated_length": 7118.16845703125,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.8435061648488045,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019706569146364927,
+      "learning_rate": 1e-05,
+      "loss": 0.0068,
+      "num_tokens": 224479306.0,
+      "reward": 0.34375,
+      "reward_std": 0.28223684430122375,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 3.288762854936067e-06,
+      "sampling/sampling_logp_difference/max": 12.624999046325684,
+      "sampling/sampling_logp_difference/mean": 0.018783386796712875,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 1.979319677047897e-05,
+      "clip_ratio/high_mean": 4.948299192619743e-06,
+      "clip_ratio/low_mean": 2.4465696469633258e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9413995889626676e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16333.0,
+      "completions/mean_length": 6052.1953125,
+      "completions/mean_terminated_length": 5718.9111328125,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "entropy": 0.8186529725790024,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001542358542792499,
+      "learning_rate": 1e-05,
+      "loss": 0.0906,
+      "num_tokens": 225273523.0,
+      "reward": 0.46875,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0017039870144799352,
+      "sampling/sampling_logp_difference/max": 6.374784469604492,
+      "sampling/sampling_logp_difference/mean": 0.0183861143887043,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 2.5990090307459468e-05,
+      "clip_ratio/high_mean": 6.497522576864867e-06,
+      "clip_ratio/low_mean": 5.721013076254167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.370765299834602e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13648.0,
+      "completions/mean_length": 6560.75,
+      "completions/mean_terminated_length": 6404.82568359375,
+      "completions/min_length": 703.0,
+      "completions/min_terminated_length": 703.0,
+      "entropy": 1.0198248624801636,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002488402184098959,
+      "learning_rate": 1e-05,
+      "loss": 0.0646,
+      "num_tokens": 226134235.0,
+      "reward": 0.375,
+      "reward_std": 0.3805803954601288,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 5.428586973721394e-06,
+      "sampling/sampling_logp_difference/max": 12.123831748962402,
+      "sampling/sampling_logp_difference/mean": 0.020803291350603104,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 1.1638113846856868e-05,
+      "clip_ratio/high_mean": 2.909528461714217e-06,
+      "clip_ratio/low_mean": 3.2134936191141605e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.504446431179531e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12694.0,
+      "completions/max_terminated_length": 12694.0,
+      "completions/mean_length": 5217.140625,
+      "completions/mean_terminated_length": 5217.140625,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.8947679325938225,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035258245188742876,
+      "learning_rate": 1e-05,
+      "loss": 0.1095,
+      "num_tokens": 226821989.0,
+      "reward": 0.6015625,
+      "reward_std": 0.4092749357223511,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998952150344849,
+      "sampling/importance_sampling_ratio/min": 1.0208474122919142e-05,
+      "sampling/sampling_logp_difference/max": 11.492292404174805,
+      "sampling/sampling_logp_difference/mean": 0.018339669331908226,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 1.1735807220247807e-05,
+      "clip_ratio/high_mean": 2.9339518050619517e-06,
+      "clip_ratio/low_mean": 1.676440933806589e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9698360574693652e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16124.0,
+      "completions/mean_length": 7622.609375,
+      "completions/mean_terminated_length": 7483.5400390625,
+      "completions/min_length": 835.0,
+      "completions/min_terminated_length": 835.0,
+      "entropy": 0.760207436978817,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001208966481499374,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 227815683.0,
+      "reward": 0.4609375,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998728632926941,
+      "sampling/importance_sampling_ratio/min": 4.0069728129310533e-05,
+      "sampling/sampling_logp_difference/max": 10.124889373779297,
+      "sampling/sampling_logp_difference/mean": 0.018406979739665985,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.5826797437057394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5826797437057394e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 5981.90625,
+      "completions/mean_terminated_length": 5816.7939453125,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.902967743575573,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001788424444384873,
+      "learning_rate": 1e-05,
+      "loss": 0.0531,
+      "num_tokens": 228599647.0,
+      "reward": 0.4609375,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999909520149231,
+      "sampling/importance_sampling_ratio/min": 0.0013331151567399502,
+      "sampling/sampling_logp_difference/max": 6.620236873626709,
+      "sampling/sampling_logp_difference/mean": 0.018927905708551407,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 1.6327461935361498e-05,
+      "clip_ratio/high_mean": 4.0818654838403745e-06,
+      "clip_ratio/low_mean": 3.461411097305245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.86959764000494e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15850.0,
+      "completions/mean_length": 6156.0,
+      "completions/mean_terminated_length": 5993.6513671875,
+      "completions/min_length": 734.0,
+      "completions/min_terminated_length": 734.0,
+      "entropy": 0.8951378241181374,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0039085340686142445,
+      "learning_rate": 1e-05,
+      "loss": 0.0263,
+      "num_tokens": 229405495.0,
+      "reward": 0.5234375,
+      "reward_std": 0.304566353559494,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99994957447052,
+      "sampling/importance_sampling_ratio/min": 0.007635246496647596,
+      "sampling/sampling_logp_difference/max": 4.8749799728393555,
+      "sampling/sampling_logp_difference/mean": 0.018469247967004776,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 1.3168388704798417e-05,
+      "clip_ratio/high_mean": 3.2920971761996043e-06,
+      "clip_ratio/low_mean": 3.1043596322888334e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4335693726461614e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15808.0,
+      "completions/mean_length": 7229.234375,
+      "completions/mean_terminated_length": 6933.9189453125,
+      "completions/min_length": 82.0,
+      "completions/min_terminated_length": 82.0,
+      "entropy": 1.0803911909461021,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001881407224573195,
+      "learning_rate": 1e-05,
+      "loss": 0.0616,
+      "num_tokens": 230350725.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000948905944824,
+      "sampling/importance_sampling_ratio/min": 3.536981239449233e-05,
+      "sampling/sampling_logp_difference/max": 10.249651908874512,
+      "sampling/sampling_logp_difference/mean": 0.021804997697472572,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.664479729399318e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.664479729399318e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16135.0,
+      "completions/mean_length": 7486.2734375,
+      "completions/mean_terminated_length": 6971.52880859375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 0.9674680531024933,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0015280995285138488,
+      "learning_rate": 1e-05,
+      "loss": 0.0263,
+      "num_tokens": 231330664.0,
+      "reward": 0.234375,
+      "reward_std": 0.22620804607868195,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999110102653503,
+      "sampling/importance_sampling_ratio/min": 0.010103696957230568,
+      "sampling/sampling_logp_difference/max": 4.59485387802124,
+      "sampling/sampling_logp_difference/mean": 0.02071535401046276,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 7.207103408291005e-06,
+      "clip_ratio/high_mean": 3.596102942537982e-06,
+      "clip_ratio/low_mean": 4.2366073103039525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.596217695507221e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 6439.40625,
+      "completions/mean_terminated_length": 6361.1025390625,
+      "completions/min_length": 338.0,
+      "completions/min_terminated_length": 338.0,
+      "entropy": 0.8368510156869888,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024581989273428917,
+      "learning_rate": 1e-05,
+      "loss": 0.026,
+      "num_tokens": 232174804.0,
+      "reward": 0.40625,
+      "reward_std": 0.3527044653892517,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999905228614807,
+      "sampling/importance_sampling_ratio/min": 0.0010985663393512368,
+      "sampling/sampling_logp_difference/max": 6.813749313354492,
+      "sampling/sampling_logp_difference/mean": 0.018181029707193375,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 2.0772107973243692e-05,
+      "clip_ratio/high_mean": 6.365107253714086e-06,
+      "clip_ratio/low_mean": 6.206619241311273e-05,
+      "clip_ratio/low_min": 1.0199641110375524e-05,
+      "clip_ratio/region_mean": 6.843129881417553e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15297.0,
+      "completions/mean_length": 6642.3984375,
+      "completions/mean_terminated_length": 6163.302734375,
+      "completions/min_length": 488.0,
+      "completions/min_terminated_length": 488.0,
+      "entropy": 1.080193243920803,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026200765278190374,
+      "learning_rate": 1e-05,
+      "loss": 0.1,
+      "num_tokens": 233042999.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808669090271,
+      "sampling/importance_sampling_ratio/min": 0.00035727949580177665,
+      "sampling/sampling_logp_difference/max": 7.936992168426514,
+      "sampling/sampling_logp_difference/mean": 0.020303232595324516,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 2.1764372377219843e-05,
+      "clip_ratio/high_mean": 5.441093094304961e-06,
+      "clip_ratio/low_mean": 8.049383222896722e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.593492520958534e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16205.0,
+      "completions/mean_length": 5594.3984375,
+      "completions/mean_terminated_length": 5509.44091796875,
+      "completions/min_length": 475.0,
+      "completions/min_terminated_length": 475.0,
+      "entropy": 0.8376244381070137,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028024003840982914,
+      "learning_rate": 1e-05,
+      "loss": 0.0317,
+      "num_tokens": 233778538.0,
+      "reward": 0.390625,
+      "reward_std": 0.3566610813140869,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999902844429016,
+      "sampling/importance_sampling_ratio/min": 0.030517347157001495,
+      "sampling/sampling_logp_difference/max": 3.489459991455078,
+      "sampling/sampling_logp_difference/mean": 0.01896265149116516,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 1.9571571556298295e-05,
+      "clip_ratio/high_mean": 4.892892889074574e-06,
+      "clip_ratio/low_mean": 1.3305952052178327e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8198844827566063e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16069.0,
+      "completions/mean_length": 6939.7890625,
+      "completions/mean_terminated_length": 6635.13671875,
+      "completions/min_length": 1303.0,
+      "completions/min_terminated_length": 1303.0,
+      "entropy": 0.923162192106247,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0004863851936534047,
+      "learning_rate": 1e-05,
+      "loss": 0.0663,
+      "num_tokens": 234683871.0,
+      "reward": 0.5234375,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 4.343670661910437e-05,
+      "sampling/sampling_logp_difference/max": 10.044205665588379,
+      "sampling/sampling_logp_difference/mean": 0.018946819007396698,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 2.6291640551789897e-05,
+      "clip_ratio/high_mean": 6.572910137947474e-06,
+      "clip_ratio/low_mean": 4.438247970028897e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0955390179296955e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15671.0,
+      "completions/mean_length": 5808.1796875,
+      "completions/mean_terminated_length": 5640.31005859375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.8330265805125237,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003028205828741193,
+      "learning_rate": 1e-05,
+      "loss": 0.0318,
+      "num_tokens": 235446758.0,
+      "reward": 0.5078125,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99998539686203,
+      "sampling/importance_sampling_ratio/min": 0.05524001643061638,
+      "sampling/sampling_logp_difference/max": 3.001615524291992,
+      "sampling/sampling_logp_difference/mean": 0.018604904413223267,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 4.42854116045055e-06,
+      "clip_ratio/high_mean": 1.1071352901126374e-06,
+      "clip_ratio/low_mean": 3.1940794087859103e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.30479292642849e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7106.125,
+      "completions/mean_terminated_length": 6806.83837890625,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "entropy": 1.0014382004737854,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022615960333496332,
+      "learning_rate": 1e-05,
+      "loss": 0.0369,
+      "num_tokens": 236377494.0,
+      "reward": 0.34375,
+      "reward_std": 0.33614397048950195,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999083280563354,
+      "sampling/importance_sampling_ratio/min": 0.0008234601118601859,
+      "sampling/sampling_logp_difference/max": 7.101995468139648,
+      "sampling/sampling_logp_difference/mean": 0.02129078283905983,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 9.011766906041885e-06,
+      "clip_ratio/high_mean": 2.252941726510471e-06,
+      "clip_ratio/low_mean": 2.9379379270721984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.163232122460613e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 6830.2109375,
+      "completions/mean_terminated_length": 6360.35205078125,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "entropy": 0.8726402744650841,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002122451551258564,
+      "learning_rate": 1e-05,
+      "loss": 0.0083,
+      "num_tokens": 237269977.0,
+      "reward": 0.484375,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999386072158813,
+      "sampling/importance_sampling_ratio/min": 0.0003835389798041433,
+      "sampling/sampling_logp_difference/max": 7.866069316864014,
+      "sampling/sampling_logp_difference/mean": 0.018967002630233765,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 3.987113814218901e-06,
+      "clip_ratio/high_mean": 9.967784535547253e-07,
+      "clip_ratio/low_mean": 2.8655875098593242e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9652653552147967e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16246.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6704.171875,
+      "completions/mean_terminated_length": 6704.171875,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.9421284720301628,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.001218589604832232,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 238147359.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2012200504541397,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 0.002478870330378413,
+      "sampling/sampling_logp_difference/max": 5.99995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02092663012444973,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 8.067639100772794e-06,
+      "clip_ratio/high_mean": 2.0169097751931986e-06,
+      "clip_ratio/low_mean": 4.687528951308195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.889219928827515e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 6142.8203125,
+      "completions/mean_terminated_length": 5639.1552734375,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.1285494044423103,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003979295492172241,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 238953104.0,
+      "reward": 0.265625,
+      "reward_std": 0.2756393849849701,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.00349772023037076,
+      "sampling/sampling_logp_difference/max": 5.655643939971924,
+      "sampling/sampling_logp_difference/mean": 0.022049173712730408,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 1.4033725619810866e-05,
+      "clip_ratio/high_mean": 3.5084314049527165e-06,
+      "clip_ratio/low_mean": 2.4028336156334262e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7536767788660654e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15710.0,
+      "completions/mean_length": 5622.296875,
+      "completions/mean_terminated_length": 5275.14501953125,
+      "completions/min_length": 396.0,
+      "completions/min_terminated_length": 396.0,
+      "entropy": 0.9032362103462219,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022260278929024935,
+      "learning_rate": 1e-05,
+      "loss": 0.0068,
+      "num_tokens": 239699350.0,
+      "reward": 0.53125,
+      "reward_std": 0.2748701572418213,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999663829803467,
+      "sampling/importance_sampling_ratio/min": 9.907654748531058e-05,
+      "sampling/sampling_logp_difference/max": 9.21961784362793,
+      "sampling/sampling_logp_difference/mean": 0.018553178757429123,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 2.0970909417883377e-05,
+      "clip_ratio/high_mean": 7.081109117734741e-06,
+      "clip_ratio/low_mean": 2.478300689290336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.186411640854203e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15532.0,
+      "completions/mean_length": 7203.6640625,
+      "completions/mean_terminated_length": 6752.171875,
+      "completions/min_length": 1073.0,
+      "completions/min_terminated_length": 1073.0,
+      "entropy": 0.9958974272012711,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001666489290073514,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 240640387.0,
+      "reward": 0.484375,
+      "reward_std": 0.30327308177948,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999366998672485,
+      "sampling/importance_sampling_ratio/min": 0.003141714259982109,
+      "sampling/sampling_logp_difference/max": 5.762986660003662,
+      "sampling/sampling_logp_difference/mean": 0.02084190584719181,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 2.8518336421257118e-05,
+      "clip_ratio/high_mean": 1.1702542110469949e-05,
+      "clip_ratio/low_mean": 4.6755864048009244e-05,
+      "clip_ratio/low_min": 9.262003914045636e-06,
+      "clip_ratio/region_mean": 5.8458407011130475e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7692.4765625,
+      "completions/mean_terminated_length": 7412.2578125,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9312580227851868,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019504680531099439,
+      "learning_rate": 1e-05,
+      "loss": 0.0514,
+      "num_tokens": 241647840.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998970031738281,
+      "sampling/importance_sampling_ratio/min": 0.00011594472016440704,
+      "sampling/sampling_logp_difference/max": 9.062397003173828,
+      "sampling/sampling_logp_difference/mean": 0.02081790193915367,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 2.4005360501178075e-05,
+      "clip_ratio/high_mean": 6.001340125294519e-06,
+      "clip_ratio/low_mean": 3.910731970790948e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.510866097007238e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14196.0,
+      "completions/mean_length": 6142.09375,
+      "completions/mean_terminated_length": 6061.44873046875,
+      "completions/min_length": 967.0,
+      "completions/min_terminated_length": 967.0,
+      "entropy": 0.8636585548520088,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025938916951417923,
+      "learning_rate": 1e-05,
+      "loss": 0.0119,
+      "num_tokens": 242452692.0,
+      "reward": 0.515625,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999980926513672,
+      "sampling/importance_sampling_ratio/min": 2.320722842341638e-06,
+      "sampling/sampling_logp_difference/max": 12.973631858825684,
+      "sampling/sampling_logp_difference/mean": 0.019208990037441254,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 4.168055966147222e-06,
+      "clip_ratio/high_mean": 1.0420139915368054e-06,
+      "clip_ratio/low_mean": 3.8637008401565254e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.967902239310206e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16030.0,
+      "completions/max_terminated_length": 16030.0,
+      "completions/mean_length": 6112.6171875,
+      "completions/mean_terminated_length": 6112.6171875,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "entropy": 0.8610381335020065,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014701929176226258,
+      "learning_rate": 1e-05,
+      "loss": 0.0377,
+      "num_tokens": 243255243.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999271035194397,
+      "sampling/importance_sampling_ratio/min": 4.6073862904449925e-05,
+      "sampling/sampling_logp_difference/max": 9.985264778137207,
+      "sampling/sampling_logp_difference/mean": 0.018754754215478897,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 8.054383215494454e-06,
+      "clip_ratio/high_mean": 2.0135958038736135e-06,
+      "clip_ratio/low_mean": 4.2183424454833585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4197020486080874e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16365.0,
+      "completions/mean_length": 7204.4375,
+      "completions/mean_terminated_length": 7132.1572265625,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 1.0613816231489182,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023235646076500416,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 244198291.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3119252324104309,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999468326568604,
+      "sampling/importance_sampling_ratio/min": 3.256353693359415e-07,
+      "sampling/sampling_logp_difference/max": 14.937487602233887,
+      "sampling/sampling_logp_difference/mean": 0.02158042974770069,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 1.0963113709294703e-05,
+      "clip_ratio/high_mean": 3.833359528471192e-06,
+      "clip_ratio/low_mean": 4.1291930529041565e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5125290171199595e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16356.0,
+      "completions/mean_length": 6308.59375,
+      "completions/mean_terminated_length": 6066.7841796875,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "entropy": 0.8048126623034477,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002957145916298032,
+      "learning_rate": 1e-05,
+      "loss": 0.0926,
+      "num_tokens": 245022975.0,
+      "reward": 0.484375,
+      "reward_std": 0.3692649006843567,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999489188194275,
+      "sampling/importance_sampling_ratio/min": 0.0005304187070578337,
+      "sampling/sampling_logp_difference/max": 7.541843891143799,
+      "sampling/sampling_logp_difference/mean": 0.017426976934075356,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 1.863301304183551e-05,
+      "clip_ratio/high_mean": 4.658253260458878e-06,
+      "clip_ratio/low_mean": 7.454315527866129e-05,
+      "clip_ratio/low_min": 8.290224286611192e-06,
+      "clip_ratio/region_mean": 7.920140842543333e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6183.75,
+      "completions/mean_terminated_length": 5938.9443359375,
+      "completions/min_length": 134.0,
+      "completions/min_terminated_length": 134.0,
+      "entropy": 0.8879657089710236,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002814161591231823,
+      "learning_rate": 1e-05,
+      "loss": 0.0791,
+      "num_tokens": 245831183.0,
+      "reward": 0.46875,
+      "reward_std": 0.3156445026397705,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999352097511292,
+      "sampling/importance_sampling_ratio/min": 7.562734390376136e-05,
+      "sampling/sampling_logp_difference/max": 9.489692687988281,
+      "sampling/sampling_logp_difference/mean": 0.01883331872522831,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 9.606681487639435e-06,
+      "clip_ratio/high_mean": 2.4016703719098587e-06,
+      "clip_ratio/low_mean": 3.564927715160593e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.805094752351579e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15525.0,
+      "completions/mean_length": 5656.8984375,
+      "completions/mean_terminated_length": 5310.86279296875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8461362943053246,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00238890596665442,
+      "learning_rate": 1e-05,
+      "loss": 0.1344,
+      "num_tokens": 246576170.0,
+      "reward": 0.3984375,
+      "reward_std": 0.37609970569610596,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999550580978394,
+      "sampling/importance_sampling_ratio/min": 0.000344505300745368,
+      "sampling/sampling_logp_difference/max": 7.973401069641113,
+      "sampling/sampling_logp_difference/mean": 0.01883539929986,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 3.868412022711709e-06,
+      "clip_ratio/high_mean": 9.671030056779273e-07,
+      "clip_ratio/low_mean": 4.4275341792854306e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.524244479853223e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14949.0,
+      "completions/mean_length": 7402.484375,
+      "completions/mean_terminated_length": 7331.763671875,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.9303053691983223,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002722573932260275,
+      "learning_rate": 1e-05,
+      "loss": 0.0331,
+      "num_tokens": 247542448.0,
+      "reward": 0.359375,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998664259910583,
+      "sampling/importance_sampling_ratio/min": 0.0015035009710118175,
+      "sampling/sampling_logp_difference/max": 6.4999589920043945,
+      "sampling/sampling_logp_difference/mean": 0.020525872707366943,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 3.7332376905396814e-06,
+      "clip_ratio/high_mean": 9.333094226349203e-07,
+      "clip_ratio/low_mean": 2.2581028019885707e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3514337442520628e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15655.0,
+      "completions/mean_length": 6920.7734375,
+      "completions/mean_terminated_length": 6455.36865234375,
+      "completions/min_length": 909.0,
+      "completions/min_terminated_length": 909.0,
+      "entropy": 0.9233825877308846,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024008466862142086,
+      "learning_rate": 1e-05,
+      "loss": 0.0349,
+      "num_tokens": 248446787.0,
+      "reward": 0.328125,
+      "reward_std": 0.2359210103750229,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999996423721313,
+      "sampling/importance_sampling_ratio/min": 0.00010231315536657348,
+      "sampling/sampling_logp_difference/max": 9.187472343444824,
+      "sampling/sampling_logp_difference/mean": 0.01887384243309498,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 1.1328072105243336e-05,
+      "clip_ratio/high_mean": 2.832018026310834e-06,
+      "clip_ratio/low_mean": 3.6861969306301035e-05,
+      "clip_ratio/low_min": 4.25054395236657e-06,
+      "clip_ratio/region_mean": 3.969398790104606e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16050.0,
+      "completions/mean_length": 6658.7109375,
+      "completions/mean_terminated_length": 6504.341796875,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.9102077335119247,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016227345913648605,
+      "learning_rate": 1e-05,
+      "loss": 0.0684,
+      "num_tokens": 249318094.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2624938488006592,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998591542243958,
+      "sampling/importance_sampling_ratio/min": 0.0038418183103203773,
+      "sampling/sampling_logp_difference/max": 5.561809539794922,
+      "sampling/sampling_logp_difference/mean": 0.019931891933083534,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 5.2942118600185495e-06,
+      "clip_ratio/high_mean": 1.3235529650046374e-06,
+      "clip_ratio/low_mean": 4.644989053304016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7773443156984285e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8597.84375,
+      "completions/mean_terminated_length": 8346.6767578125,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9965319409966469,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023056245408952236,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 250435674.0,
+      "reward": 0.296875,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 0.005126871634274721,
+      "sampling/sampling_logp_difference/max": 5.27325963973999,
+      "sampling/sampling_logp_difference/mean": 0.02132929116487503,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 8.388911510337493e-06,
+      "clip_ratio/high_mean": 2.0972278775843733e-06,
+      "clip_ratio/low_mean": 4.1705150920279266e-05,
+      "clip_ratio/low_min": 5.85781890549697e-06,
+      "clip_ratio/region_mean": 4.380237885470706e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14905.0,
+      "completions/max_terminated_length": 14905.0,
+      "completions/mean_length": 6053.0390625,
+      "completions/mean_terminated_length": 6053.0390625,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "entropy": 1.0717384740710258,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022299408446997404,
+      "learning_rate": 1e-05,
+      "loss": 0.0054,
+      "num_tokens": 251232847.0,
+      "reward": 0.3515625,
+      "reward_std": 0.26143795251846313,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000006914138794,
+      "sampling/importance_sampling_ratio/min": 0.0024789744056761265,
+      "sampling/sampling_logp_difference/max": 5.999910354614258,
+      "sampling/sampling_logp_difference/mean": 0.021233227103948593,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 1.0162047374251415e-05,
+      "clip_ratio/high_mean": 2.5405118435628538e-06,
+      "clip_ratio/low_mean": 5.296576864566305e-05,
+      "clip_ratio/low_min": 8.864200026437175e-06,
+      "clip_ratio/region_mean": 5.550628043238248e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15929.0,
+      "completions/mean_length": 6553.7109375,
+      "completions/mean_terminated_length": 6476.30712890625,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "entropy": 0.9829569607973099,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0026091893669217825,
+      "learning_rate": 1e-05,
+      "loss": 0.0384,
+      "num_tokens": 252088154.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999917149543762,
+      "sampling/importance_sampling_ratio/min": 0.0010629174066707492,
+      "sampling/sampling_logp_difference/max": 6.846737861633301,
+      "sampling/sampling_logp_difference/mean": 0.020414084196090698,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 9.021045798363048e-06,
+      "clip_ratio/high_mean": 2.255261449590762e-06,
+      "clip_ratio/low_mean": 3.9386548451147974e-05,
+      "clip_ratio/low_min": 4.476596132008126e-06,
+      "clip_ratio/region_mean": 4.1641809502834803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15218.0,
+      "completions/mean_length": 6391.7421875,
+      "completions/mean_terminated_length": 5985.552734375,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.7887687161564827,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018632705323398113,
+      "learning_rate": 1e-05,
+      "loss": 0.1007,
+      "num_tokens": 252926073.0,
+      "reward": 0.4609375,
+      "reward_std": 0.33903977274894714,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999994158744812,
+      "sampling/importance_sampling_ratio/min": 0.0001141107059083879,
+      "sampling/sampling_logp_difference/max": 9.078341484069824,
+      "sampling/sampling_logp_difference/mean": 0.016558727249503136,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.388932546182332e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.388932546182332e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15492.0,
+      "completions/mean_length": 7519.140625,
+      "completions/mean_terminated_length": 7306.38427734375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "entropy": 0.8663278818130493,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014314674772322178,
+      "learning_rate": 1e-05,
+      "loss": 0.0432,
+      "num_tokens": 253908571.0,
+      "reward": 0.296875,
+      "reward_std": 0.21436560153961182,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999785423278809,
+      "sampling/importance_sampling_ratio/min": 9.006411971768102e-08,
+      "sampling/sampling_logp_difference/max": 16.22274398803711,
+      "sampling/sampling_logp_difference/mean": 0.019052794203162193,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 4.941101906297263e-06,
+      "clip_ratio/high_mean": 1.2352754765743157e-06,
+      "clip_ratio/low_mean": 1.9741319533750357e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0976595237698348e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15343.0,
+      "completions/max_terminated_length": 15343.0,
+      "completions/mean_length": 5273.7265625,
+      "completions/mean_terminated_length": 5273.7265625,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.973240926861763,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00404210714623332,
+      "learning_rate": 1e-05,
+      "loss": 0.0706,
+      "num_tokens": 254601856.0,
+      "reward": 0.4921875,
+      "reward_std": 0.25460803508758545,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999933123588562,
+      "sampling/importance_sampling_ratio/min": 5.1447856094455346e-05,
+      "sampling/sampling_logp_difference/max": 9.8749418258667,
+      "sampling/sampling_logp_difference/mean": 0.01859421283006668,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 9.725902600621339e-06,
+      "clip_ratio/high_mean": 2.4314756501553347e-06,
+      "clip_ratio/low_mean": 2.9865542501283926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2297018492499774e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16018.0,
+      "completions/mean_length": 6674.5390625,
+      "completions/mean_terminated_length": 6598.08642578125,
+      "completions/min_length": 719.0,
+      "completions/min_terminated_length": 719.0,
+      "entropy": 0.9493648260831833,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003681440372020006,
+      "learning_rate": 1e-05,
+      "loss": 0.0347,
+      "num_tokens": 255474357.0,
+      "reward": 0.359375,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998538494110107,
+      "sampling/importance_sampling_ratio/min": 4.5425484131556004e-05,
+      "sampling/sampling_logp_difference/max": 9.99943733215332,
+      "sampling/sampling_logp_difference/mean": 0.020322658121585846,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 1.3442999488688656e-05,
+      "clip_ratio/high_mean": 4.46992856950601e-06,
+      "clip_ratio/low_mean": 4.9175514504895546e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3645443131244974e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15713.0,
+      "completions/mean_length": 7113.59375,
+      "completions/mean_terminated_length": 6736.74755859375,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 0.8717286512255669,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014825655380263925,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 256405745.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999269247055054,
+      "sampling/importance_sampling_ratio/min": 0.0015039225108921528,
+      "sampling/sampling_logp_difference/max": 6.499678611755371,
+      "sampling/sampling_logp_difference/mean": 0.019822338595986366,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 2.0328425534898997e-05,
+      "clip_ratio/high_mean": 6.525457763473241e-06,
+      "clip_ratio/low_mean": 1.983899721835769e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.636445498183093e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15655.0,
+      "completions/mean_length": 5819.9765625,
+      "completions/mean_terminated_length": 5736.79541015625,
+      "completions/min_length": 608.0,
+      "completions/min_terminated_length": 608.0,
+      "entropy": 0.9206694886088371,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002185023855417967,
+      "learning_rate": 1e-05,
+      "loss": 0.0957,
+      "num_tokens": 257171214.0,
+      "reward": 0.4375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 0.0011616232804954052,
+      "sampling/sampling_logp_difference/max": 6.757936954498291,
+      "sampling/sampling_logp_difference/mean": 0.018492478877305984,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 2.2664371726932586e-05,
+      "clip_ratio/high_mean": 6.88441667762163e-06,
+      "clip_ratio/low_mean": 4.306056735003949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.994498453925189e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16170.0,
+      "completions/mean_length": 6754.7109375,
+      "completions/mean_terminated_length": 6523.6083984375,
+      "completions/min_length": 531.0,
+      "completions/min_terminated_length": 531.0,
+      "entropy": 0.8881036639213562,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022363397292792797,
+      "learning_rate": 1e-05,
+      "loss": 0.1086,
+      "num_tokens": 258064049.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0005261205951683223,
+      "sampling/sampling_logp_difference/max": 7.549980163574219,
+      "sampling/sampling_logp_difference/mean": 0.01989433914422989,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.3297232107543095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3297232107543095e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15599.0,
+      "completions/mean_length": 7953.421875,
+      "completions/mean_terminated_length": 7610.71533203125,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9007300287485123,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001413302612490952,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 259098655.0,
+      "reward": 0.3203125,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999911785125732,
+      "sampling/importance_sampling_ratio/min": 0.00017562482389621437,
+      "sampling/sampling_logp_difference/max": 8.647160530090332,
+      "sampling/sampling_logp_difference/mean": 0.019421691074967384,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 3.664743485387589e-05,
+      "clip_ratio/high_mean": 1.2026366050577053e-05,
+      "clip_ratio/low_mean": 3.211230455235636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4138670659776835e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15430.0,
+      "completions/mean_length": 6669.390625,
+      "completions/mean_terminated_length": 6515.19091796875,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.8598581254482269,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018268795683979988,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 259971017.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2896084189414978,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 1.7517091066565627e-07,
+      "sampling/sampling_logp_difference/max": 15.557503700256348,
+      "sampling/sampling_logp_difference/mean": 0.01863129623234272,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 5.219860668148613e-06,
+      "clip_ratio/high_mean": 1.3049651670371532e-06,
+      "clip_ratio/low_mean": 2.3785564053468988e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.509052933419298e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 11342.0,
+      "completions/max_terminated_length": 11342.0,
+      "completions/mean_length": 5268.2890625,
+      "completions/mean_terminated_length": 5268.2890625,
+      "completions/min_length": 818.0,
+      "completions/min_terminated_length": 818.0,
+      "entropy": 0.8647450804710388,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027839087415486574,
+      "learning_rate": 1e-05,
+      "loss": 0.1259,
+      "num_tokens": 260663534.0,
+      "reward": 0.6171875,
+      "reward_std": 0.3345640003681183,
+      "rewards/accuracy_reward/mean": 0.6171875,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998882412910461,
+      "sampling/importance_sampling_ratio/min": 0.008392918854951859,
+      "sampling/sampling_logp_difference/max": 4.780366897583008,
+      "sampling/sampling_logp_difference/mean": 0.017936093732714653,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 3.5293785458634375e-06,
+      "clip_ratio/high_mean": 8.823446364658594e-07,
+      "clip_ratio/low_mean": 3.2431569934487925e-05,
+      "clip_ratio/low_min": 3.789371476159431e-06,
+      "clip_ratio/region_mean": 3.331391440042353e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14955.0,
+      "completions/mean_length": 7037.0,
+      "completions/mean_terminated_length": 6496.26416015625,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 0.9258207008242607,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002726807491853833,
+      "learning_rate": 1e-05,
+      "loss": 0.1071,
+      "num_tokens": 261583222.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999408721923828,
+      "sampling/importance_sampling_ratio/min": 0.0004893821314908564,
+      "sampling/sampling_logp_difference/max": 7.622366905212402,
+      "sampling/sampling_logp_difference/mean": 0.019336845725774765,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 3.219348491256824e-05,
+      "clip_ratio/high_mean": 8.04837122814206e-06,
+      "clip_ratio/low_mean": 3.258790718518867e-05,
+      "clip_ratio/low_min": 6.961073722777655e-06,
+      "clip_ratio/region_mean": 4.0636279095451755e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 6469.78125,
+      "completions/mean_terminated_length": 6391.71630859375,
+      "completions/min_length": 652.0,
+      "completions/min_terminated_length": 652.0,
+      "entropy": 0.9932648614048958,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00209408369846642,
+      "learning_rate": 1e-05,
+      "loss": 0.0446,
+      "num_tokens": 262430162.0,
+      "reward": 0.375,
+      "reward_std": 0.3640199303627014,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999074339866638,
+      "sampling/importance_sampling_ratio/min": 0.003386466298252344,
+      "sampling/sampling_logp_difference/max": 5.6879682540893555,
+      "sampling/sampling_logp_difference/mean": 0.020799942314624786,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 2.827135813276982e-05,
+      "clip_ratio/high_mean": 8.08931497431331e-06,
+      "clip_ratio/low_mean": 4.0315980186278466e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.840529436478391e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15815.0,
+      "completions/max_terminated_length": 15815.0,
+      "completions/mean_length": 5471.6953125,
+      "completions/mean_terminated_length": 5471.6953125,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "entropy": 0.979861818253994,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0032934497576206923,
+      "learning_rate": 1e-05,
+      "loss": 0.0511,
+      "num_tokens": 263148331.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3440523147583008,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000145435333252,
+      "sampling/importance_sampling_ratio/min": 4.68981761514442e-06,
+      "sampling/sampling_logp_difference/max": 12.270116806030273,
+      "sampling/sampling_logp_difference/mean": 0.019479844719171524,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.3237559869594406e-05,
+      "clip_ratio/high_mean": 3.3093899673986016e-06,
+      "clip_ratio/low_mean": 5.419432636699639e-05,
+      "clip_ratio/low_min": 3.509559974190779e-06,
+      "clip_ratio/region_mean": 5.750371656176867e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16121.0,
+      "completions/mean_length": 6640.65625,
+      "completions/mean_terminated_length": 6161.47509765625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8560378029942513,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0014544804580509663,
+      "learning_rate": 1e-05,
+      "loss": 0.1159,
+      "num_tokens": 264017391.0,
+      "reward": 0.515625,
+      "reward_std": 0.31983357667922974,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999976396560669,
+      "sampling/importance_sampling_ratio/min": 0.00810791365802288,
+      "sampling/sampling_logp_difference/max": 4.814914703369141,
+      "sampling/sampling_logp_difference/mean": 0.01882140152156353,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 3.979497705586255e-06,
+      "clip_ratio/high_mean": 9.948744263965636e-07,
+      "clip_ratio/low_mean": 3.569043906281877e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.668531348921533e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16249.0,
+      "completions/mean_length": 5950.7421875,
+      "completions/mean_terminated_length": 5700.34423828125,
+      "completions/min_length": 873.0,
+      "completions/min_terminated_length": 873.0,
+      "entropy": 0.9033292010426521,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001294711953960359,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 264799326.0,
+      "reward": 0.5546875,
+      "reward_std": 0.22621294856071472,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000641345977783,
+      "sampling/importance_sampling_ratio/min": 0.0011992956278845668,
+      "sampling/sampling_logp_difference/max": 6.726020812988281,
+      "sampling/sampling_logp_difference/mean": 0.019538050517439842,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 3.0064740258239908e-05,
+      "clip_ratio/high_mean": 7.516185064559977e-06,
+      "clip_ratio/low_mean": 3.826810700502392e-05,
+      "clip_ratio/low_min": 4.875575541518629e-06,
+      "clip_ratio/region_mean": 4.578429286539176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15068.0,
+      "completions/mean_length": 6356.0703125,
+      "completions/mean_terminated_length": 6196.89697265625,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 0.8268664851784706,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022473863791674376,
+      "learning_rate": 1e-05,
+      "loss": 0.072,
+      "num_tokens": 265630895.0,
+      "reward": 0.4375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999737739562988,
+      "sampling/importance_sampling_ratio/min": 2.5895053113345057e-05,
+      "sampling/sampling_logp_difference/max": 10.561458587646484,
+      "sampling/sampling_logp_difference/mean": 0.01843554526567459,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.8887641999754123e-05,
+      "clip_ratio/high_mean": 5.5906657507875934e-06,
+      "clip_ratio/low_mean": 7.594743829031358e-05,
+      "clip_ratio/low_min": 8.592850917921169e-06,
+      "clip_ratio/region_mean": 8.153810449584853e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7955.546875,
+      "completions/mean_terminated_length": 7821.76220703125,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "entropy": 0.9475079327821732,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023036333732306957,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 266666285.0,
+      "reward": 0.421875,
+      "reward_std": 0.36008089780807495,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 1.0642166614616144e-07,
+      "sampling/sampling_logp_difference/max": 16.055856704711914,
+      "sampling/sampling_logp_difference/mean": 0.020778125151991844,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9688118729609414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9688118729609414e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16247.0,
+      "completions/mean_length": 7701.7578125,
+      "completions/mean_terminated_length": 6965.974609375,
+      "completions/min_length": 685.0,
+      "completions/min_terminated_length": 685.0,
+      "entropy": 0.8349794074892998,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0020953568164259195,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 267669230.0,
+      "reward": 0.46875,
+      "reward_std": 0.17176413536071777,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999356269836426,
+      "sampling/importance_sampling_ratio/min": 0.010210023261606693,
+      "sampling/sampling_logp_difference/max": 4.584385395050049,
+      "sampling/sampling_logp_difference/mean": 0.018453046679496765,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 1.9330177565279882e-05,
+      "clip_ratio/high_mean": 4.832544391319971e-06,
+      "clip_ratio/low_mean": 3.980111284818122e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4633657012127514e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16374.0,
+      "completions/mean_length": 7335.40625,
+      "completions/mean_terminated_length": 7118.240234375,
+      "completions/min_length": 418.0,
+      "completions/min_terminated_length": 418.0,
+      "entropy": 0.9238340929150581,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016563549870625138,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 268627714.0,
+      "reward": 0.390625,
+      "reward_std": 0.32036250829696655,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999822378158569,
+      "sampling/importance_sampling_ratio/min": 0.0011709382524713874,
+      "sampling/sampling_logp_difference/max": 6.749949932098389,
+      "sampling/sampling_logp_difference/mean": 0.019696014001965523,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 1.5036271179269534e-05,
+      "clip_ratio/high_mean": 3.7590677948173834e-06,
+      "clip_ratio/low_mean": 4.6864498017384904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.062356603957596e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15040.0,
+      "completions/max_terminated_length": 15040.0,
+      "completions/mean_length": 6259.875,
+      "completions/mean_terminated_length": 6259.875,
+      "completions/min_length": 1012.0,
+      "completions/min_terminated_length": 1012.0,
+      "entropy": 1.0842352360486984,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017849374562501907,
+      "learning_rate": 1e-05,
+      "loss": 0.0279,
+      "num_tokens": 269447338.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2977364957332611,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998852014541626,
+      "sampling/importance_sampling_ratio/min": 0.009620909579098225,
+      "sampling/sampling_logp_difference/max": 4.6438164710998535,
+      "sampling/sampling_logp_difference/mean": 0.020421095192432404,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.4728739188285545e-05,
+      "clip_ratio/high_mean": 3.682184797071386e-06,
+      "clip_ratio/low_mean": 2.7205874630453764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.08880598822725e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15784.0,
+      "completions/max_terminated_length": 15784.0,
+      "completions/mean_length": 7626.125,
+      "completions/mean_terminated_length": 7626.125,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.1077729761600494,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017999790143221617,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "num_tokens": 270444594.0,
+      "reward": 0.390625,
+      "reward_std": 0.24381662905216217,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99991375207901,
+      "sampling/importance_sampling_ratio/min": 2.4265028741865535e-07,
+      "sampling/sampling_logp_difference/max": 15.231644630432129,
+      "sampling/sampling_logp_difference/mean": 0.021409697830677032,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 1.5701789834565716e-05,
+      "clip_ratio/high_mean": 3.925447458641429e-06,
+      "clip_ratio/low_mean": 3.2665291655575857e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.659073934159096e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15531.0,
+      "completions/max_terminated_length": 15531.0,
+      "completions/mean_length": 5581.5625,
+      "completions/mean_terminated_length": 5581.5625,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "entropy": 0.8401889503002167,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031031551770865917,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 271177242.0,
+      "reward": 0.625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999693036079407,
+      "sampling/importance_sampling_ratio/min": 0.00020852939633186907,
+      "sampling/sampling_logp_difference/max": 8.475430488586426,
+      "sampling/sampling_logp_difference/mean": 0.017869479954242706,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.981169902544934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.981169902544934e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15973.0,
+      "completions/mean_length": 6442.84375,
+      "completions/mean_terminated_length": 6364.56689453125,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "entropy": 0.8304163441061974,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002635185606777668,
+      "learning_rate": 1e-05,
+      "loss": 0.037,
+      "num_tokens": 272021830.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.0004586660652421415,
+      "sampling/sampling_logp_difference/max": 7.687188148498535,
+      "sampling/sampling_logp_difference/mean": 0.01730487309396267,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 2.2348198399413377e-05,
+      "clip_ratio/high_mean": 6.557516371685779e-06,
+      "clip_ratio/low_mean": 5.170885208372056e-05,
+      "clip_ratio/low_min": 4.756469024869148e-06,
+      "clip_ratio/region_mean": 5.826636891015369e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15868.0,
+      "completions/mean_length": 6052.265625,
+      "completions/mean_terminated_length": 5888.27001953125,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "entropy": 0.9033217504620552,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0031849017832428217,
+      "learning_rate": 1e-05,
+      "loss": 0.0572,
+      "num_tokens": 272818080.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999919533729553,
+      "sampling/importance_sampling_ratio/min": 2.2380504560715053e-07,
+      "sampling/sampling_logp_difference/max": 15.312490463256836,
+      "sampling/sampling_logp_difference/mean": 0.019191090017557144,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 3.71780379282427e-06,
+      "clip_ratio/high_mean": 9.294509482060676e-07,
+      "clip_ratio/low_mean": 6.115805626905058e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.20875071035698e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16068.0,
+      "completions/max_terminated_length": 16068.0,
+      "completions/mean_length": 6337.5859375,
+      "completions/mean_terminated_length": 6337.5859375,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "entropy": 1.0558827072381973,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002086545340716839,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 273648579.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31276631355285645,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000107288360596,
+      "sampling/importance_sampling_ratio/min": 7.982287934282795e-05,
+      "sampling/sampling_logp_difference/max": 9.435700416564941,
+      "sampling/sampling_logp_difference/mean": 0.021268527954816818,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 1.228984365297947e-05,
+      "clip_ratio/high_mean": 3.0724609132448677e-06,
+      "clip_ratio/low_mean": 3.2620800709537434e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.56932616227823e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15556.0,
+      "completions/mean_length": 6439.78125,
+      "completions/mean_terminated_length": 6361.48046875,
+      "completions/min_length": 890.0,
+      "completions/min_terminated_length": 890.0,
+      "entropy": 0.989262692630291,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002226081909611821,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 274493159.0,
+      "reward": 0.3984375,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000194311141968,
+      "sampling/importance_sampling_ratio/min": 0.03169185668230057,
+      "sampling/sampling_logp_difference/max": 3.451695442199707,
+      "sampling/sampling_logp_difference/mean": 0.019788069650530815,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 7.10556764715875e-06,
+      "clip_ratio/high_mean": 1.7763919117896876e-06,
+      "clip_ratio/low_mean": 3.469589137239382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.647228299996641e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16050.0,
+      "completions/mean_length": 7641.5234375,
+      "completions/mean_terminated_length": 7572.68505859375,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 1.1427540630102158,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022452943958342075,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 275490762.0,
+      "reward": 0.203125,
+      "reward_std": 0.2567248046398163,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0003476575657259673,
+      "sampling/sampling_logp_difference/max": 7.964292526245117,
+      "sampling/sampling_logp_difference/mean": 0.022936880588531494,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 3.430955530348001e-06,
+      "clip_ratio/high_mean": 8.577388825870003e-07,
+      "clip_ratio/low_mean": 1.611294828762766e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6970687056527822e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15717.0,
+      "completions/mean_length": 6291.046875,
+      "completions/mean_terminated_length": 6211.57470703125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.1789169162511826,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.001387307420372963,
+      "learning_rate": 1e-05,
+      "loss": -0.0026,
+      "num_tokens": 276314904.0,
+      "reward": 0.28125,
+      "reward_std": 0.1712273508310318,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000487565994263,
+      "sampling/importance_sampling_ratio/min": 0.012205099686980247,
+      "sampling/sampling_logp_difference/max": 4.4059014320373535,
+      "sampling/sampling_logp_difference/mean": 0.020597899332642555,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 1.1513505342009012e-05,
+      "clip_ratio/high_mean": 2.878376335502253e-06,
+      "clip_ratio/low_mean": 5.239053416516981e-05,
+      "clip_ratio/low_min": 5.946967576164752e-06,
+      "clip_ratio/region_mean": 5.526891072804574e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15962.0,
+      "completions/mean_length": 7677.5,
+      "completions/mean_terminated_length": 7019.025390625,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.9808845967054367,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018187003443017602,
+      "learning_rate": 1e-05,
+      "loss": 0.0705,
+      "num_tokens": 277320888.0,
+      "reward": 0.25,
+      "reward_std": 0.2880108058452606,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999768733978271,
+      "sampling/importance_sampling_ratio/min": 0.0001234103983733803,
+      "sampling/sampling_logp_difference/max": 8.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.0210642758756876,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 1.7702866443869425e-05,
+      "clip_ratio/high_mean": 4.425716610967356e-06,
+      "clip_ratio/low_mean": 4.517976913120947e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.960548540111631e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15783.0,
+      "completions/mean_length": 7066.1484375,
+      "completions/mean_terminated_length": 6992.779296875,
+      "completions/min_length": 580.0,
+      "completions/min_terminated_length": 580.0,
+      "entropy": 1.0734655261039734,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019406796200200915,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 278245739.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29249146580696106,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.004089824389666319,
+      "sampling/sampling_logp_difference/max": 5.499253273010254,
+      "sampling/sampling_logp_difference/mean": 0.020316962152719498,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 1.661570968281012e-05,
+      "clip_ratio/high_mean": 5.1870877086912515e-06,
+      "clip_ratio/low_mean": 1.647002238769346e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.165711032375839e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14474.0,
+      "completions/max_terminated_length": 14474.0,
+      "completions/mean_length": 5187.5078125,
+      "completions/mean_terminated_length": 5187.5078125,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "entropy": 0.9958596602082253,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023044368717819452,
+      "learning_rate": 1e-05,
+      "loss": -0.002,
+      "num_tokens": 278933796.0,
+      "reward": 0.453125,
+      "reward_std": 0.22331714630126953,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999975562095642,
+      "sampling/importance_sampling_ratio/min": 1.0969968570861965e-05,
+      "sampling/sampling_logp_difference/max": 11.42034912109375,
+      "sampling/sampling_logp_difference/mean": 0.019379254430532455,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 1.5325686035794206e-05,
+      "clip_ratio/high_mean": 3.8314215089485515e-06,
+      "clip_ratio/low_mean": 2.3057583121044445e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.688900440261932e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15550.0,
+      "completions/mean_length": 6871.0859375,
+      "completions/mean_terminated_length": 6484.3818359375,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 0.8953125178813934,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026841885410249233,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 279832175.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3595392107963562,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001311302185,
+      "sampling/importance_sampling_ratio/min": 0.004663798026740551,
+      "sampling/sampling_logp_difference/max": 5.36792516708374,
+      "sampling/sampling_logp_difference/mean": 0.019127724692225456,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 1.315804820478661e-05,
+      "clip_ratio/high_mean": 4.150227596255718e-06,
+      "clip_ratio/low_mean": 3.6840762675183214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0990990044065256e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14255.0,
+      "completions/mean_length": 6459.2109375,
+      "completions/mean_terminated_length": 6381.06298828125,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8647114709019661,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0014444541884586215,
+      "learning_rate": 1e-05,
+      "loss": 0.0198,
+      "num_tokens": 280678482.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999886751174927,
+      "sampling/importance_sampling_ratio/min": 0.0019316815305501223,
+      "sampling/sampling_logp_difference/max": 6.249364376068115,
+      "sampling/sampling_logp_difference/mean": 0.01974722556769848,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.500776003624196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.500776003624196e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16081.0,
+      "completions/mean_length": 6280.0546875,
+      "completions/mean_terminated_length": 6037.56005859375,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "entropy": 0.9132707491517067,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001992191653698683,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 281499753.0,
+      "reward": 0.375,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999694228172302,
+      "sampling/importance_sampling_ratio/min": 2.558048436185345e-05,
+      "sampling/sampling_logp_difference/max": 10.573680877685547,
+      "sampling/sampling_logp_difference/mean": 0.01896769367158413,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.2855523436883232e-05,
+      "clip_ratio/high_mean": 3.213880859220808e-06,
+      "clip_ratio/low_mean": 2.9316923928490723e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2530804674024694e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16381.0,
+      "completions/mean_length": 6220.578125,
+      "completions/mean_terminated_length": 5892.7255859375,
+      "completions/min_length": 798.0,
+      "completions/min_terminated_length": 798.0,
+      "entropy": 0.8257150128483772,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003750045085325837,
+      "learning_rate": 1e-05,
+      "loss": 0.0631,
+      "num_tokens": 282316795.0,
+      "reward": 0.515625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999854564666748,
+      "sampling/importance_sampling_ratio/min": 2.2095075280503806e-07,
+      "sampling/sampling_logp_difference/max": 15.325325965881348,
+      "sampling/sampling_logp_difference/mean": 0.017498498782515526,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 9.090150342672132e-06,
+      "clip_ratio/high_mean": 2.272537585668033e-06,
+      "clip_ratio/low_mean": 5.6543332675573765e-05,
+      "clip_ratio/low_min": 4.705262199422577e-06,
+      "clip_ratio/region_mean": 5.881586980649445e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16134.0,
+      "completions/mean_length": 6845.09375,
+      "completions/mean_terminated_length": 6693.68310546875,
+      "completions/min_length": 29.0,
+      "completions/min_terminated_length": 29.0,
+      "entropy": 0.9700654074549675,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002124012913554907,
+      "learning_rate": 1e-05,
+      "loss": 0.0657,
+      "num_tokens": 283212095.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3527093529701233,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914169311523,
+      "sampling/importance_sampling_ratio/min": 4.450856749826926e-07,
+      "sampling/sampling_logp_difference/max": 14.624999046325684,
+      "sampling/sampling_logp_difference/mean": 0.02086886763572693,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 4.2354217839601915e-06,
+      "clip_ratio/high_mean": 1.0588554459900479e-06,
+      "clip_ratio/low_mean": 5.4464956633637485e-05,
+      "clip_ratio/low_min": 7.402143637591507e-06,
+      "clip_ratio/region_mean": 5.552381219331437e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15416.0,
+      "completions/max_terminated_length": 15416.0,
+      "completions/mean_length": 4986.3828125,
+      "completions/mean_terminated_length": 4986.3828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9103464111685753,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035143878776580095,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "num_tokens": 283871808.0,
+      "reward": 0.4296875,
+      "reward_std": 0.40715324878692627,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999771118164062,
+      "sampling/importance_sampling_ratio/min": 0.0028091762214899063,
+      "sampling/sampling_logp_difference/max": 5.874864101409912,
+      "sampling/sampling_logp_difference/mean": 0.01833461783826351,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 1.915729558277235e-05,
+      "clip_ratio/high_mean": 4.789323895693087e-06,
+      "clip_ratio/low_mean": 2.4886074015739723e-05,
+      "clip_ratio/low_min": 2.922677595051937e-06,
+      "clip_ratio/region_mean": 2.9675398081963067e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15954.0,
+      "completions/mean_length": 6467.9921875,
+      "completions/mean_terminated_length": 6310.595703125,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "entropy": 0.926672600209713,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0014899170491844416,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 284718943.0,
+      "reward": 0.390625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999134540557861,
+      "sampling/importance_sampling_ratio/min": 0.00027431987109594047,
+      "sampling/sampling_logp_difference/max": 8.201215744018555,
+      "sampling/sampling_logp_difference/mean": 0.01909649185836315,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.792281761936465e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.792281761936465e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15890.0,
+      "completions/mean_length": 6009.3671875,
+      "completions/mean_terminated_length": 5927.67724609375,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "entropy": 1.0197014585137367,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001638311194255948,
+      "learning_rate": 1e-05,
+      "loss": 0.0342,
+      "num_tokens": 285507622.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998466968536377,
+      "sampling/importance_sampling_ratio/min": 2.144540849258192e-05,
+      "sampling/sampling_logp_difference/max": 10.75,
+      "sampling/sampling_logp_difference/mean": 0.0198800191283226,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 1.3140848295734031e-05,
+      "clip_ratio/high_mean": 3.2852120739335078e-06,
+      "clip_ratio/low_mean": 5.1451362480747775e-05,
+      "clip_ratio/low_min": 7.097433353919769e-06,
+      "clip_ratio/region_mean": 5.473657506627205e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15174.0,
+      "completions/max_terminated_length": 15174.0,
+      "completions/mean_length": 6360.421875,
+      "completions/mean_terminated_length": 6360.421875,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.9253586605191231,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017278637969866395,
+      "learning_rate": 1e-05,
+      "loss": 0.0638,
+      "num_tokens": 286341012.0,
+      "reward": 0.390625,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998660087585449,
+      "sampling/importance_sampling_ratio/min": 5.007527215639129e-05,
+      "sampling/sampling_logp_difference/max": 9.901983261108398,
+      "sampling/sampling_logp_difference/mean": 0.02024514600634575,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 2.1974663468427025e-05,
+      "clip_ratio/high_mean": 6.800322353228694e-06,
+      "clip_ratio/low_mean": 3.598067922894188e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.27810022642916e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16158.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 5470.5234375,
+      "completions/mean_terminated_length": 5470.5234375,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.9031187370419502,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00106104149017483,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 287065039.0,
+      "reward": 0.3828125,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999252557754517,
+      "sampling/importance_sampling_ratio/min": 1.6605448536211043e-06,
+      "sampling/sampling_logp_difference/max": 13.308364868164062,
+      "sampling/sampling_logp_difference/mean": 0.018382512032985687,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.3466772088577272e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.3466772088577272e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15190.0,
+      "completions/max_terminated_length": 15190.0,
+      "completions/mean_length": 5533.265625,
+      "completions/mean_terminated_length": 5533.265625,
+      "completions/min_length": 109.0,
+      "completions/min_terminated_length": 109.0,
+      "entropy": 1.0052079856395721,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033145309425890446,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 287793249.0,
+      "reward": 0.484375,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999661445617676,
+      "sampling/importance_sampling_ratio/min": 0.04231228679418564,
+      "sampling/sampling_logp_difference/max": 3.162677764892578,
+      "sampling/sampling_logp_difference/mean": 0.020278627052903175,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 3.310516694909893e-05,
+      "clip_ratio/high_mean": 8.276291737274732e-06,
+      "clip_ratio/low_mean": 3.8735864336558734e-05,
+      "clip_ratio/low_min": 3.0842873002256965e-06,
+      "clip_ratio/region_mean": 4.7012156073833467e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 6025.6796875,
+      "completions/mean_terminated_length": 5604.609375,
+      "completions/min_length": 583.0,
+      "completions/min_terminated_length": 583.0,
+      "entropy": 0.8798701837658882,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023973146453499794,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 288582232.0,
+      "reward": 0.453125,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998465776443481,
+      "sampling/importance_sampling_ratio/min": 5.531576334760757e-06,
+      "sampling/sampling_logp_difference/max": 12.105037689208984,
+      "sampling/sampling_logp_difference/mean": 0.01999252662062645,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 1.2754688668792369e-05,
+      "clip_ratio/high_mean": 4.434933430275123e-06,
+      "clip_ratio/low_mean": 2.503601820080803e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.947095174476999e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14890.0,
+      "completions/mean_length": 6893.5390625,
+      "completions/mean_terminated_length": 6818.81103515625,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.8881499394774437,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016761437291279435,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 289483997.0,
+      "reward": 0.3515625,
+      "reward_std": 0.26143792271614075,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00004243850708,
+      "sampling/importance_sampling_ratio/min": 4.540014560916461e-05,
+      "sampling/sampling_logp_difference/max": 9.999995231628418,
+      "sampling/sampling_logp_difference/mean": 0.019294647499918938,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.8526947997088428e-05,
+      "clip_ratio/high_mean": 4.631736999272107e-06,
+      "clip_ratio/low_mean": 4.962505795447214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.425679569270869e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 6087.828125,
+      "completions/mean_terminated_length": 6006.755859375,
+      "completions/min_length": 608.0,
+      "completions/min_terminated_length": 608.0,
+      "entropy": 0.8525711894035339,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.002270620781928301,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 290282639.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3645517826080322,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999156594276428,
+      "sampling/importance_sampling_ratio/min": 0.0006376233650371432,
+      "sampling/sampling_logp_difference/max": 7.357762813568115,
+      "sampling/sampling_logp_difference/mean": 0.01862185075879097,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 1.1926310435228515e-05,
+      "clip_ratio/high_mean": 2.981577608807129e-06,
+      "clip_ratio/low_mean": 5.369399366372818e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6675571954656334e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15920.0,
+      "completions/mean_length": 7951.0,
+      "completions/mean_terminated_length": 7678.96728515625,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "entropy": 0.9653833135962486,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0013396133435890079,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 291320703.0,
+      "reward": 0.375,
+      "reward_std": 0.3429914712905884,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 2.4461383873131126e-05,
+      "sampling/sampling_logp_difference/max": 10.618414878845215,
+      "sampling/sampling_logp_difference/mean": 0.0205213762819767,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.886164773168275e-05,
+      "clip_ratio/high_mean": 4.715411932920688e-06,
+      "clip_ratio/low_mean": 4.581529401548323e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0530706175777595e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6017.2578125,
+      "completions/mean_terminated_length": 5852.70654296875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9492783322930336,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003194117220118642,
+      "learning_rate": 1e-05,
+      "loss": 0.0868,
+      "num_tokens": 292113384.0,
+      "reward": 0.5703125,
+      "reward_std": 0.36743485927581787,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999614357948303,
+      "sampling/importance_sampling_ratio/min": 0.004017275292426348,
+      "sampling/sampling_logp_difference/max": 5.517151355743408,
+      "sampling/sampling_logp_difference/mean": 0.02062429115176201,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 1.4877897228871007e-05,
+      "clip_ratio/high_mean": 3.7194743072177516e-06,
+      "clip_ratio/low_mean": 3.613741432673123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.985688817920163e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15690.0,
+      "completions/mean_length": 6696.0,
+      "completions/mean_terminated_length": 6619.71630859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 1.0417355075478554,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001876713940873742,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 292990600.0,
+      "reward": 0.34375,
+      "reward_std": 0.28011518716812134,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998572468757629,
+      "sampling/importance_sampling_ratio/min": 3.398728586034849e-05,
+      "sampling/sampling_logp_difference/max": 10.28952407836914,
+      "sampling/sampling_logp_difference/mean": 0.020289337262511253,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8955274046893464e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8955274046893464e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14436.0,
+      "completions/mean_length": 5184.203125,
+      "completions/mean_terminated_length": 5096.015625,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "entropy": 1.0320965945720673,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002229714998975396,
+      "learning_rate": 1e-05,
+      "loss": 0.0351,
+      "num_tokens": 293673106.0,
+      "reward": 0.375,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000356435775757,
+      "sampling/importance_sampling_ratio/min": 5.736888851970434e-05,
+      "sampling/sampling_logp_difference/max": 9.766008377075195,
+      "sampling/sampling_logp_difference/mean": 0.01969832368195057,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 1.2176971722510643e-05,
+      "clip_ratio/high_mean": 3.044242930627661e-06,
+      "clip_ratio/low_mean": 4.728799405029349e-05,
+      "clip_ratio/low_min": 5.63901312489179e-06,
+      "clip_ratio/region_mean": 5.033223698092115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15582.0,
+      "completions/mean_length": 6664.2890625,
+      "completions/mean_terminated_length": 6510.00830078125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "entropy": 0.8329441174864769,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001597537542693317,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 294545927.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000386238098145,
+      "sampling/importance_sampling_ratio/min": 0.00012341710680630058,
+      "sampling/sampling_logp_difference/max": 8.999940872192383,
+      "sampling/sampling_logp_difference/mean": 0.018238451331853867,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 3.2730224575061584e-06,
+      "clip_ratio/high_mean": 8.182556143765396e-07,
+      "clip_ratio/low_mean": 5.867890376975993e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.94971597820404e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16322.0,
+      "completions/mean_length": 7486.4921875,
+      "completions/mean_terminated_length": 7345.26220703125,
+      "completions/min_length": 952.0,
+      "completions/min_terminated_length": 952.0,
+      "entropy": 1.0071435943245888,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0018223393708467484,
+      "learning_rate": 1e-05,
+      "loss": 0.1035,
+      "num_tokens": 295523558.0,
+      "reward": 0.359375,
+      "reward_std": 0.36561262607574463,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999614357948303,
+      "sampling/importance_sampling_ratio/min": 8.459773198410403e-06,
+      "sampling/sampling_logp_difference/max": 11.680188179016113,
+      "sampling/sampling_logp_difference/mean": 0.021324433386325836,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 1.9864856540152687e-05,
+      "clip_ratio/high_mean": 4.966214135038172e-06,
+      "clip_ratio/low_mean": 4.498222278925823e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.994843698113982e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14737.0,
+      "completions/mean_length": 6103.015625,
+      "completions/mean_terminated_length": 6022.06298828125,
+      "completions/min_length": 573.0,
+      "completions/min_terminated_length": 573.0,
+      "entropy": 0.9639975428581238,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002672795206308365,
+      "learning_rate": 1e-05,
+      "loss": 0.0559,
+      "num_tokens": 296323888.0,
+      "reward": 0.375,
+      "reward_std": 0.32589420676231384,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998803734779358,
+      "sampling/importance_sampling_ratio/min": 0.0057671889662742615,
+      "sampling/sampling_logp_difference/max": 5.1555705070495605,
+      "sampling/sampling_logp_difference/mean": 0.019866492599248886,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 1.1948508017667336e-05,
+      "clip_ratio/high_mean": 2.987127004416834e-06,
+      "clip_ratio/low_mean": 4.0038267286490736e-05,
+      "clip_ratio/low_min": 3.0986614092398668e-06,
+      "clip_ratio/region_mean": 4.302539394984706e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15805.0,
+      "completions/mean_length": 6524.640625,
+      "completions/mean_terminated_length": 6368.14306640625,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.8653942495584488,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016479750629514456,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 297179234.0,
+      "reward": 0.46875,
+      "reward_std": 0.28011518716812134,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 0.0009119793539866805,
+      "sampling/sampling_logp_difference/max": 6.9998931884765625,
+      "sampling/sampling_logp_difference/mean": 0.018908966332674026,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 7.669039405300282e-06,
+      "clip_ratio/high_mean": 1.9172598513250705e-06,
+      "clip_ratio/low_mean": 2.1955054876343638e-05,
+      "clip_ratio/low_min": 3.4466595479898388e-06,
+      "clip_ratio/region_mean": 2.387231518241606e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16294.0,
+      "completions/mean_length": 8057.3203125,
+      "completions/mean_terminated_length": 7857.48046875,
+      "completions/min_length": 637.0,
+      "completions/min_terminated_length": 637.0,
+      "entropy": 1.0029005706310272,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018210343550890684,
+      "learning_rate": 1e-05,
+      "loss": 0.0309,
+      "num_tokens": 298230699.0,
+      "reward": 0.25,
+      "reward_std": 0.19438526034355164,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999086856842041,
+      "sampling/importance_sampling_ratio/min": 0.0046700225211679935,
+      "sampling/sampling_logp_difference/max": 5.366591453552246,
+      "sampling/sampling_logp_difference/mean": 0.020166225731372833,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 6.953715910640312e-06,
+      "clip_ratio/high_mean": 1.738428977660078e-06,
+      "clip_ratio/low_mean": 2.961834002235264e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1356769113699556e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 6875.3125,
+      "completions/mean_terminated_length": 6647.1044921875,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8582051023840904,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021944146137684584,
+      "learning_rate": 1e-05,
+      "loss": 0.072,
+      "num_tokens": 299131579.0,
+      "reward": 0.4375,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999915361404419,
+      "sampling/importance_sampling_ratio/min": 5.424213668447919e-06,
+      "sampling/sampling_logp_difference/max": 12.124637603759766,
+      "sampling/sampling_logp_difference/mean": 0.018997181206941605,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.4359977967615123e-05,
+      "clip_ratio/high_mean": 5.290952628911327e-06,
+      "clip_ratio/low_mean": 1.991117466104697e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5202126892054366e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16093.0,
+      "completions/mean_length": 7046.46875,
+      "completions/mean_terminated_length": 6745.2578125,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8899112716317177,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021380677353590727,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 300051471.0,
+      "reward": 0.390625,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000321865081787,
+      "sampling/importance_sampling_ratio/min": 0.00043609709246084094,
+      "sampling/sampling_logp_difference/max": 7.737645626068115,
+      "sampling/sampling_logp_difference/mean": 0.018849756568670273,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 1.1736750366253546e-05,
+      "clip_ratio/high_mean": 2.9341875915633864e-06,
+      "clip_ratio/low_mean": 2.6090394442235265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.902458214748549e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14683.0,
+      "completions/mean_length": 7227.8203125,
+      "completions/mean_terminated_length": 7008.072265625,
+      "completions/min_length": 869.0,
+      "completions/min_terminated_length": 869.0,
+      "entropy": 0.9667621031403542,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001994286896660924,
+      "learning_rate": 1e-05,
+      "loss": 0.0231,
+      "num_tokens": 300994584.0,
+      "reward": 0.4296875,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000085830688477,
+      "sampling/importance_sampling_ratio/min": 0.005131956655532122,
+      "sampling/sampling_logp_difference/max": 5.272268295288086,
+      "sampling/sampling_logp_difference/mean": 0.019861025735735893,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 5.608902483800193e-06,
+      "clip_ratio/high_mean": 1.4022256209500483e-06,
+      "clip_ratio/low_mean": 1.2587312312462018e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.3989537819725228e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16342.0,
+      "completions/mean_length": 6763.484375,
+      "completions/mean_terminated_length": 6372.40625,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "entropy": 0.9238758087158203,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019569231662899256,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 301878446.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504647254944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999585151672363,
+      "sampling/importance_sampling_ratio/min": 6.425123189046644e-08,
+      "sampling/sampling_logp_difference/max": 16.56046485900879,
+      "sampling/sampling_logp_difference/mean": 0.019518161192536354,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 4.044129582325695e-06,
+      "clip_ratio/high_mean": 1.0110323955814238e-06,
+      "clip_ratio/low_mean": 3.2966671312806284e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3977703822074545e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16018.0,
+      "completions/max_terminated_length": 16018.0,
+      "completions/mean_length": 6098.703125,
+      "completions/mean_terminated_length": 6098.703125,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "entropy": 0.7785998061299324,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0024868762120604515,
+      "learning_rate": 1e-05,
+      "loss": 0.0405,
+      "num_tokens": 302677272.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999961853027344,
+      "sampling/importance_sampling_ratio/min": 0.003617732785642147,
+      "sampling/sampling_logp_difference/max": 5.621907711029053,
+      "sampling/sampling_logp_difference/mean": 0.017242450267076492,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.291554517341865e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.291554517341865e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15935.0,
+      "completions/mean_length": 6799.1875,
+      "completions/mean_terminated_length": 6569.15234375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 0.8998014703392982,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0017842436209321022,
+      "learning_rate": 1e-05,
+      "loss": 0.0286,
+      "num_tokens": 303565408.0,
+      "reward": 0.3046875,
+      "reward_std": 0.17806214094161987,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000123977661133,
+      "sampling/importance_sampling_ratio/min": 0.002333547454327345,
+      "sampling/sampling_logp_difference/max": 6.060365676879883,
+      "sampling/sampling_logp_difference/mean": 0.01987488754093647,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 2.6103274649358355e-05,
+      "clip_ratio/high_mean": 7.854475143176387e-06,
+      "clip_ratio/low_mean": 5.6201750339823775e-05,
+      "clip_ratio/low_min": 6.543817562487675e-06,
+      "clip_ratio/region_mean": 6.405622525562649e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15338.0,
+      "completions/mean_length": 5483.4140625,
+      "completions/mean_terminated_length": 5131.7822265625,
+      "completions/min_length": 526.0,
+      "completions/min_terminated_length": 526.0,
+      "entropy": 0.8604720532894135,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004101207479834557,
+      "learning_rate": 1e-05,
+      "loss": 0.083,
+      "num_tokens": 304283925.0,
+      "reward": 0.4375,
+      "reward_std": 0.3174794614315033,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999923825263977,
+      "sampling/importance_sampling_ratio/min": 8.628804062027484e-05,
+      "sampling/sampling_logp_difference/max": 9.357819557189941,
+      "sampling/sampling_logp_difference/mean": 0.018733445554971695,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 8.375103107027826e-06,
+      "clip_ratio/high_mean": 2.0937757767569565e-06,
+      "clip_ratio/low_mean": 4.883176779912901e-05,
+      "clip_ratio/low_min": 7.539494390584878e-06,
+      "clip_ratio/region_mean": 5.092554329166887e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7857.9140625,
+      "completions/mean_terminated_length": 7722.57958984375,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "entropy": 0.9493537694215775,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025712712667882442,
+      "learning_rate": 1e-05,
+      "loss": 0.011,
+      "num_tokens": 305311730.0,
+      "reward": 0.3125,
+      "reward_std": 0.3227166533470154,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999322295188904,
+      "sampling/importance_sampling_ratio/min": 0.00010902724170591682,
+      "sampling/sampling_logp_difference/max": 9.123912811279297,
+      "sampling/sampling_logp_difference/mean": 0.020730353891849518,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 1.7927761746250326e-05,
+      "clip_ratio/high_mean": 4.4819404365625815e-06,
+      "clip_ratio/low_mean": 1.4648778403625329e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.913071884018791e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14578.0,
+      "completions/mean_length": 6591.28125,
+      "completions/mean_terminated_length": 6514.17333984375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.8540837243199348,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001778970006853342,
+      "learning_rate": 1e-05,
+      "loss": 0.0552,
+      "num_tokens": 306172870.0,
+      "reward": 0.53125,
+      "reward_std": 0.25855979323387146,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999608397483826,
+      "sampling/importance_sampling_ratio/min": 0.005589231848716736,
+      "sampling/sampling_logp_difference/max": 5.18691349029541,
+      "sampling/sampling_logp_difference/mean": 0.018087508156895638,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 1.5696539094278705e-05,
+      "clip_ratio/high_mean": 3.924134773569676e-06,
+      "clip_ratio/low_mean": 4.2228432448609965e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.615256762008357e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 7443.5859375,
+      "completions/mean_terminated_length": 7301.6748046875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 1.1251945495605469,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024547462817281485,
+      "learning_rate": 1e-05,
+      "loss": -0.0017,
+      "num_tokens": 307145857.0,
+      "reward": 0.2734375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000030994415283,
+      "sampling/importance_sampling_ratio/min": 0.0008770838030613959,
+      "sampling/sampling_logp_difference/max": 7.038908004760742,
+      "sampling/sampling_logp_difference/mean": 0.021768298000097275,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 7.035515409370419e-06,
+      "clip_ratio/high_mean": 1.7588788523426047e-06,
+      "clip_ratio/low_mean": 2.2691801063956518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4450679802612285e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14811.0,
+      "completions/max_terminated_length": 14811.0,
+      "completions/mean_length": 6497.890625,
+      "completions/mean_terminated_length": 6497.890625,
+      "completions/min_length": 1079.0,
+      "completions/min_terminated_length": 1079.0,
+      "entropy": 1.0804385766386986,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003075090004131198,
+      "learning_rate": 1e-05,
+      "loss": 0.012,
+      "num_tokens": 307998003.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20753081142902374,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999311566352844,
+      "sampling/importance_sampling_ratio/min": 0.0032886455301195383,
+      "sampling/sampling_logp_difference/max": 5.717279434204102,
+      "sampling/sampling_logp_difference/mean": 0.021208221092820168,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 1.0550694696576102e-05,
+      "clip_ratio/high_mean": 3.640079512479133e-06,
+      "clip_ratio/low_mean": 3.440372779550671e-05,
+      "clip_ratio/low_min": 4.334107870818116e-06,
+      "clip_ratio/region_mean": 3.804380708061217e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16155.0,
+      "completions/mean_length": 7146.578125,
+      "completions/mean_terminated_length": 6692.2783203125,
+      "completions/min_length": 1089.0,
+      "completions/min_terminated_length": 1089.0,
+      "entropy": 0.900071032345295,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023383013904094696,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 308930389.0,
+      "reward": 0.453125,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000137090682983,
+      "sampling/importance_sampling_ratio/min": 0.003526465967297554,
+      "sampling/sampling_logp_difference/max": 5.647459030151367,
+      "sampling/sampling_logp_difference/mean": 0.019267898052930832,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 2.1745769345216104e-05,
+      "clip_ratio/high_mean": 6.434876752337004e-06,
+      "clip_ratio/low_mean": 3.9315604908551904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5750481831419165e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14293.0,
+      "completions/mean_length": 6189.109375,
+      "completions/mean_terminated_length": 6108.83447265625,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 0.9284940734505653,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018437084509059787,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 309741419.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3050953149795532,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000801086425781,
+      "sampling/importance_sampling_ratio/min": 4.7444238589378074e-05,
+      "sampling/sampling_logp_difference/max": 9.955955505371094,
+      "sampling/sampling_logp_difference/mean": 0.019703445956110954,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 1.630432370802737e-05,
+      "clip_ratio/high_mean": 4.076080927006842e-06,
+      "clip_ratio/low_mean": 3.713273554240004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1208816355720046e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15556.0,
+      "completions/mean_length": 5456.7421875,
+      "completions/mean_terminated_length": 5194.48828125,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "entropy": 0.9236080572009087,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030215675942599773,
+      "learning_rate": 1e-05,
+      "loss": 0.0431,
+      "num_tokens": 310458386.0,
+      "reward": 0.46875,
+      "reward_std": 0.30168038606643677,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 0.00015846964379306883,
+      "sampling/sampling_logp_difference/max": 8.749947547912598,
+      "sampling/sampling_logp_difference/mean": 0.01910843700170517,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 2.3289825548999943e-05,
+      "clip_ratio/high_mean": 5.822456387249986e-06,
+      "clip_ratio/low_mean": 3.062871041947801e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.645116612460697e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15118.0,
+      "completions/mean_length": 6246.25,
+      "completions/mean_terminated_length": 6085.33349609375,
+      "completions/min_length": 514.0,
+      "completions/min_terminated_length": 514.0,
+      "entropy": 1.0128052979707718,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002812379039824009,
+      "learning_rate": 1e-05,
+      "loss": 0.0117,
+      "num_tokens": 311279114.0,
+      "reward": 0.390625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999204277992249,
+      "sampling/importance_sampling_ratio/min": 0.0007136549684219062,
+      "sampling/sampling_logp_difference/max": 7.245110988616943,
+      "sampling/sampling_logp_difference/mean": 0.02073795720934868,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 1.566006790199026e-05,
+      "clip_ratio/high_mean": 3.915016975497565e-06,
+      "clip_ratio/low_mean": 1.4384278813395213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.829929567520594e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15893.0,
+      "completions/mean_length": 7661.859375,
+      "completions/mean_terminated_length": 7452.5283203125,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 0.9746306762099266,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018165848450735211,
+      "learning_rate": 1e-05,
+      "loss": 0.0255,
+      "num_tokens": 312280648.0,
+      "reward": 0.3984375,
+      "reward_std": 0.15991678833961487,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999211430549622,
+      "sampling/importance_sampling_ratio/min": 2.2834767150925472e-05,
+      "sampling/sampling_logp_difference/max": 10.687226295471191,
+      "sampling/sampling_logp_difference/mean": 0.02064785361289978,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 6.112351002229843e-06,
+      "clip_ratio/high_mean": 1.5280877505574608e-06,
+      "clip_ratio/low_mean": 1.7822256495492184e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9350344246049644e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15283.0,
+      "completions/mean_length": 6575.921875,
+      "completions/mean_terminated_length": 6498.69287109375,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "entropy": 1.0576276555657387,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0009623004007153213,
+      "learning_rate": 1e-05,
+      "loss": -0.0131,
+      "num_tokens": 313142142.0,
+      "reward": 0.296875,
+      "reward_std": 0.17176413536071777,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999088048934937,
+      "sampling/importance_sampling_ratio/min": 0.00010695109085645527,
+      "sampling/sampling_logp_difference/max": 9.143138885498047,
+      "sampling/sampling_logp_difference/mean": 0.02001393586397171,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 2.1532956907321932e-05,
+      "clip_ratio/high_mean": 7.117228278730181e-06,
+      "clip_ratio/low_mean": 4.647828791348729e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359551732908585e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16045.0,
+      "completions/mean_length": 7349.8203125,
+      "completions/mean_terminated_length": 7133.00048828125,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "entropy": 0.9633770063519478,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016735537210479379,
+      "learning_rate": 1e-05,
+      "loss": 0.0769,
+      "num_tokens": 314106551.0,
+      "reward": 0.3125,
+      "reward_std": 0.27670514583587646,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0006543444469571114,
+      "sampling/sampling_logp_difference/max": 7.331876754760742,
+      "sampling/sampling_logp_difference/mean": 0.01907072216272354,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 1.9804372868748032e-05,
+      "clip_ratio/high_mean": 4.951093217187008e-06,
+      "clip_ratio/low_mean": 2.807680073146912e-05,
+      "clip_ratio/low_min": 3.144654101561173e-06,
+      "clip_ratio/region_mean": 3.302789434656006e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16343.0,
+      "completions/mean_length": 7472.6640625,
+      "completions/mean_terminated_length": 7402.49609375,
+      "completions/min_length": 942.0,
+      "completions/min_terminated_length": 942.0,
+      "entropy": 1.0234674662351608,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0029567319434136152,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 315081020.0,
+      "reward": 0.328125,
+      "reward_std": 0.1841355264186859,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999366998672485,
+      "sampling/importance_sampling_ratio/min": 1.3551310985349119e-05,
+      "sampling/sampling_logp_difference/max": 11.209027290344238,
+      "sampling/sampling_logp_difference/mean": 0.020730063319206238,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 2.2943146859688568e-05,
+      "clip_ratio/high_mean": 6.9194542788864055e-06,
+      "clip_ratio/low_mean": 3.046788117444521e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.738733437330666e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16302.0,
+      "completions/mean_length": 7663.28125,
+      "completions/mean_terminated_length": 7234.39306640625,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "entropy": 0.989475853741169,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002559094922617078,
+      "learning_rate": 1e-05,
+      "loss": 0.002,
+      "num_tokens": 316083520.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3227117359638214,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 0.003966364543884993,
+      "sampling/sampling_logp_difference/max": 5.529905319213867,
+      "sampling/sampling_logp_difference/mean": 0.02191789261996746,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 1.007244372885907e-05,
+      "clip_ratio/high_mean": 2.5181109322147677e-06,
+      "clip_ratio/low_mean": 4.157553627237576e-05,
+      "clip_ratio/low_min": 7.249949248944176e-06,
+      "clip_ratio/region_mean": 4.4093647659337876e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15227.0,
+      "completions/mean_length": 6828.703125,
+      "completions/mean_terminated_length": 6440.2763671875,
+      "completions/min_length": 799.0,
+      "completions/min_terminated_length": 799.0,
+      "entropy": 0.9493783265352249,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001576121780090034,
+      "learning_rate": 1e-05,
+      "loss": 0.0414,
+      "num_tokens": 316982154.0,
+      "reward": 0.4375,
+      "reward_std": 0.25726157426834106,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999561309814453,
+      "sampling/importance_sampling_ratio/min": 0.002232425380498171,
+      "sampling/sampling_logp_difference/max": 6.104666709899902,
+      "sampling/sampling_logp_difference/mean": 0.020356670022010803,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 4.308265033614589e-06,
+      "clip_ratio/high_mean": 1.0770662584036472e-06,
+      "clip_ratio/low_mean": 3.2841844813447096e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.391891118553758e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15194.0,
+      "completions/mean_length": 6555.2890625,
+      "completions/mean_terminated_length": 5986.685546875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.9516563713550568,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002562758279964328,
+      "learning_rate": 1e-05,
+      "loss": -0.0459,
+      "num_tokens": 317841415.0,
+      "reward": 0.2734375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999120831489563,
+      "sampling/importance_sampling_ratio/min": 5.153654274181463e-05,
+      "sampling/sampling_logp_difference/max": 9.87321949005127,
+      "sampling/sampling_logp_difference/mean": 0.019885078072547913,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 1.579595573275583e-05,
+      "clip_ratio/high_mean": 3.948988933188957e-06,
+      "clip_ratio/low_mean": 5.6516228141845204e-05,
+      "clip_ratio/low_min": 1.2799536079910467e-05,
+      "clip_ratio/region_mean": 6.046521548341843e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 8033.5625,
+      "completions/mean_terminated_length": 7764.193359375,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "entropy": 1.0841791555285454,
+      "epoch": 0.35418583256669733,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0015623728977516294,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 318892079.0,
+      "reward": 0.234375,
+      "reward_std": 0.26249873638153076,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999390840530396,
+      "sampling/importance_sampling_ratio/min": 0.0027189957909286022,
+      "sampling/sampling_logp_difference/max": 5.907492637634277,
+      "sampling/sampling_logp_difference/mean": 0.022173013538122177,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 1.592646640347084e-05,
+      "clip_ratio/high_mean": 3.98161660086771e-06,
+      "clip_ratio/low_mean": 3.5816001627608784e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.979761731898179e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14983.0,
+      "completions/mean_length": 6105.0390625,
+      "completions/mean_terminated_length": 6024.1025390625,
+      "completions/min_length": 1010.0,
+      "completions/min_terminated_length": 1010.0,
+      "entropy": 0.7882698476314545,
+      "epoch": 0.35510579576816925,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0015339057426899672,
+      "learning_rate": 1e-05,
+      "loss": 0.0568,
+      "num_tokens": 319692740.0,
+      "reward": 0.5625,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999640583992004,
+      "sampling/importance_sampling_ratio/min": 0.005946483928710222,
+      "sampling/sampling_logp_difference/max": 5.124955177307129,
+      "sampling/sampling_logp_difference/mean": 0.017854198813438416,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 3.630976607382763e-06,
+      "clip_ratio/high_mean": 9.077441518456908e-07,
+      "clip_ratio/low_mean": 2.5168051195123553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6075795346969244e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14242.0,
+      "completions/max_terminated_length": 14242.0,
+      "completions/mean_length": 7078.359375,
+      "completions/mean_terminated_length": 7078.359375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 1.0915816724300385,
+      "epoch": 0.3560257589696412,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.000674036389682442,
+      "learning_rate": 1e-05,
+      "loss": 0.0477,
+      "num_tokens": 320618618.0,
+      "reward": 0.375,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999241828918457,
+      "sampling/importance_sampling_ratio/min": 0.012588412500917912,
+      "sampling/sampling_logp_difference/max": 4.374978542327881,
+      "sampling/sampling_logp_difference/mean": 0.021491196006536484,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 2.3060737021296518e-05,
+      "clip_ratio/high_mean": 8.880587984094745e-06,
+      "clip_ratio/low_mean": 4.042122702685447e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.930181512463605e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15486.0,
+      "completions/mean_length": 7647.6875,
+      "completions/mean_terminated_length": 7065.26708984375,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.8284596502780914,
+      "epoch": 0.35694572217111314,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001767225214280188,
+      "learning_rate": 1e-05,
+      "loss": 0.0847,
+      "num_tokens": 321617138.0,
+      "reward": 0.4765625,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999255537986755,
+      "sampling/importance_sampling_ratio/min": 0.0026657104026526213,
+      "sampling/sampling_logp_difference/max": 5.9272847175598145,
+      "sampling/sampling_logp_difference/mean": 0.018413839861750603,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 9.76903538685292e-06,
+      "clip_ratio/high_mean": 3.700462343658728e-06,
+      "clip_ratio/low_mean": 2.6322781820908858e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0023243880350492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14298.0,
+      "completions/mean_length": 6616.8984375,
+      "completions/mean_terminated_length": 6461.865234375,
+      "completions/min_length": 981.0,
+      "completions/min_terminated_length": 981.0,
+      "entropy": 0.9324140176177025,
+      "epoch": 0.3578656853725851,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0007780150044709444,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 322482213.0,
+      "reward": 0.5078125,
+      "reward_std": 0.19332444667816162,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999249577522278,
+      "sampling/importance_sampling_ratio/min": 8.851349093674798e-07,
+      "sampling/sampling_logp_difference/max": 13.937525749206543,
+      "sampling/sampling_logp_difference/mean": 0.019632574170827866,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.183885348154945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.183885348154945e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15911.0,
+      "completions/mean_length": 6376.375,
+      "completions/mean_terminated_length": 6297.57470703125,
+      "completions/min_length": 715.0,
+      "completions/min_terminated_length": 715.0,
+      "entropy": 1.0122736915946007,
+      "epoch": 0.35878564857405704,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.00017013182514347136,
+      "learning_rate": 1e-05,
+      "loss": 0.0068,
+      "num_tokens": 323316413.0,
+      "reward": 0.484375,
+      "reward_std": 0.1173202246427536,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999897480010986,
+      "sampling/importance_sampling_ratio/min": 0.001820300007238984,
+      "sampling/sampling_logp_difference/max": 6.308753967285156,
+      "sampling/sampling_logp_difference/mean": 0.020268389955163002,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 1.2158910067228135e-05,
+      "clip_ratio/high_mean": 4.907883408122871e-06,
+      "clip_ratio/low_mean": 3.3955970252463885e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.886385343321308e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 7434.703125,
+      "completions/mean_terminated_length": 7364.236328125,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 1.056224174797535,
+      "epoch": 0.35970561177552896,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0019504460506141186,
+      "learning_rate": 1e-05,
+      "loss": 0.0176,
+      "num_tokens": 324289663.0,
+      "reward": 0.3046875,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999295473098755,
+      "sampling/importance_sampling_ratio/min": 0.0005411410820670426,
+      "sampling/sampling_logp_difference/max": 7.5218305587768555,
+      "sampling/sampling_logp_difference/mean": 0.021627606824040413,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 2.5075807570829056e-05,
+      "clip_ratio/high_mean": 7.3508283549017506e-06,
+      "clip_ratio/low_mean": 3.88432285944873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.619405763151008e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 6783.9140625,
+      "completions/mean_terminated_length": 6708.32275390625,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.9994921758770943,
+      "epoch": 0.36062557497700093,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003350428305566311,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 325174860.0,
+      "reward": 0.40625,
+      "reward_std": 0.33797895908355713,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999490976333618,
+      "sampling/importance_sampling_ratio/min": 0.0019297851249575615,
+      "sampling/sampling_logp_difference/max": 6.250346660614014,
+      "sampling/sampling_logp_difference/mean": 0.02060745656490326,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 5.086883902549744e-06,
+      "clip_ratio/high_mean": 2.125662831531372e-06,
+      "clip_ratio/low_mean": 3.603865525292349e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.816431808445486e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15520.0,
+      "completions/mean_length": 6797.28125,
+      "completions/mean_terminated_length": 6645.111328125,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "entropy": 0.9564928039908409,
+      "epoch": 0.36154553817847285,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030228395480662584,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 326065824.0,
+      "reward": 0.46875,
+      "reward_std": 0.27722427248954773,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999678134918213,
+      "sampling/importance_sampling_ratio/min": 1.927352604980115e-05,
+      "sampling/sampling_logp_difference/max": 10.856778144836426,
+      "sampling/sampling_logp_difference/mean": 0.020122073590755463,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 8.678096946823644e-06,
+      "clip_ratio/high_mean": 2.169524236705911e-06,
+      "clip_ratio/low_mean": 2.1449313862831332e-05,
+      "clip_ratio/low_min": 3.5140985801263014e-06,
+      "clip_ratio/region_mean": 2.361883775847673e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15744.0,
+      "completions/mean_length": 7028.4765625,
+      "completions/mean_terminated_length": 6954.81103515625,
+      "completions/min_length": 920.0,
+      "completions/min_terminated_length": 920.0,
+      "entropy": 0.9178477674722672,
+      "epoch": 0.3624655013799448,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027565474156290293,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 326985805.0,
+      "reward": 0.40625,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999858736991882,
+      "sampling/importance_sampling_ratio/min": 0.003095855936408043,
+      "sampling/sampling_logp_difference/max": 5.777690887451172,
+      "sampling/sampling_logp_difference/mean": 0.019194945693016052,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 1.1162969258293742e-05,
+      "clip_ratio/high_mean": 2.7907423145734356e-06,
+      "clip_ratio/low_mean": 4.0257837554236175e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.304857930037542e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15646.0,
+      "completions/mean_length": 6254.71875,
+      "completions/mean_terminated_length": 6174.96044921875,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.9090404361486435,
+      "epoch": 0.36338546458141674,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022540187928825617,
+      "learning_rate": 1e-05,
+      "loss": 0.0586,
+      "num_tokens": 327805417.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999850392341614,
+      "sampling/importance_sampling_ratio/min": 0.007726692594587803,
+      "sampling/sampling_logp_difference/max": 4.86307430267334,
+      "sampling/sampling_logp_difference/mean": 0.01917862705886364,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 2.4049867988651386e-05,
+      "clip_ratio/high_mean": 6.012466997162846e-06,
+      "clip_ratio/low_mean": 2.1124733166288934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7137200504512293e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16051.0,
+      "completions/mean_length": 7654.546875,
+      "completions/mean_terminated_length": 7225.22900390625,
+      "completions/min_length": 1708.0,
+      "completions/min_terminated_length": 1708.0,
+      "entropy": 0.9535491093993187,
+      "epoch": 0.36430542778288866,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0013819639571011066,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 328804303.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2301519513130188,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999759793281555,
+      "sampling/importance_sampling_ratio/min": 0.00017957323871087283,
+      "sampling/sampling_logp_difference/max": 8.624927520751953,
+      "sampling/sampling_logp_difference/mean": 0.019935712218284607,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 4.677968718169723e-06,
+      "clip_ratio/high_mean": 1.1694921795424307e-06,
+      "clip_ratio/low_mean": 4.5318136926653096e-05,
+      "clip_ratio/low_min": 1.0762409146991558e-05,
+      "clip_ratio/region_mean": 4.648762910619553e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 6929.859375,
+      "completions/mean_terminated_length": 6702.96044921875,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.8612276986241341,
+      "epoch": 0.36522539098436063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015145445941016078,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 329711437.0,
+      "reward": 0.4375,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998670220375061,
+      "sampling/importance_sampling_ratio/min": 6.962344286876032e-06,
+      "sampling/sampling_logp_difference/max": 11.874994277954102,
+      "sampling/sampling_logp_difference/mean": 0.01896081678569317,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 1.5800192159076687e-05,
+      "clip_ratio/high_mean": 5.8905598052660935e-06,
+      "clip_ratio/low_mean": 1.027900856342967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.616956859606944e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15568.0,
+      "completions/mean_length": 6751.09375,
+      "completions/mean_terminated_length": 6675.244140625,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "entropy": 1.008638858795166,
+      "epoch": 0.36614535418583255,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0010175694478675723,
+      "learning_rate": 1e-05,
+      "loss": -0.0079,
+      "num_tokens": 330594657.0,
+      "reward": 0.40625,
+      "reward_std": 0.17017142474651337,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999219179153442,
+      "sampling/importance_sampling_ratio/min": 6.605670205317438e-05,
+      "sampling/sampling_logp_difference/max": 9.62499713897705,
+      "sampling/sampling_logp_difference/mean": 0.019827818498015404,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 7.255490572788403e-06,
+      "clip_ratio/high_mean": 1.8138726431971008e-06,
+      "clip_ratio/low_mean": 4.20189051055786e-05,
+      "clip_ratio/low_min": 7.900641321612056e-06,
+      "clip_ratio/region_mean": 4.383277814667963e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16280.0,
+      "completions/mean_length": 7907.796875,
+      "completions/mean_terminated_length": 7563.2353515625,
+      "completions/min_length": 821.0,
+      "completions/min_terminated_length": 821.0,
+      "entropy": 0.8603325337171555,
+      "epoch": 0.3670653173873045,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014811329310759902,
+      "learning_rate": 1e-05,
+      "loss": 0.0714,
+      "num_tokens": 331626943.0,
+      "reward": 0.28125,
+      "reward_std": 0.2161829173564911,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998874068260193,
+      "sampling/importance_sampling_ratio/min": 3.0665268013763125e-07,
+      "sampling/sampling_logp_difference/max": 14.997550010681152,
+      "sampling/sampling_logp_difference/mean": 0.018387217074632645,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 1.2884957641290384e-05,
+      "clip_ratio/high_mean": 4.083570104285172e-06,
+      "clip_ratio/low_mean": 1.6143149423442082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.022671930035358e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16206.0,
+      "completions/mean_length": 7498.40625,
+      "completions/mean_terminated_length": 7137.203125,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 1.0180751085281372,
+      "epoch": 0.36798528058877644,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001668943208642304,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 332605987.0,
+      "reward": 0.3359375,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725222587585,
+      "sampling/importance_sampling_ratio/min": 3.239733814552892e-08,
+      "sampling/sampling_logp_difference/max": 17.245189666748047,
+      "sampling/sampling_logp_difference/mean": 0.020663965493440628,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8121567652306112e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8121567652306112e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15740.0,
+      "completions/mean_length": 6650.4453125,
+      "completions/mean_terminated_length": 6495.94482421875,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 0.9293805658817291,
+      "epoch": 0.3689052437902484,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0036925526801496744,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 333475324.0,
+      "reward": 0.3828125,
+      "reward_std": 0.19674427807331085,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999297857284546,
+      "sampling/importance_sampling_ratio/min": 0.0019147126004099846,
+      "sampling/sampling_logp_difference/max": 6.258187770843506,
+      "sampling/sampling_logp_difference/mean": 0.01987956464290619,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 9.03130421647802e-06,
+      "clip_ratio/high_mean": 2.257826054119505e-06,
+      "clip_ratio/low_mean": 3.9613908143110166e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.187173419722967e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14410.0,
+      "completions/mean_length": 6967.6328125,
+      "completions/mean_terminated_length": 6663.87890625,
+      "completions/min_length": 658.0,
+      "completions/min_terminated_length": 658.0,
+      "entropy": 0.8103456348180771,
+      "epoch": 0.36982520699172033,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015530216041952372,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 334389053.0,
+      "reward": 0.4765625,
+      "reward_std": 0.29932138323783875,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999937415122986,
+      "sampling/importance_sampling_ratio/min": 1.2903526112495456e-05,
+      "sampling/sampling_logp_difference/max": 11.258009910583496,
+      "sampling/sampling_logp_difference/mean": 0.018520750105381012,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 7.21459082342335e-06,
+      "clip_ratio/high_mean": 1.8036477058558376e-06,
+      "clip_ratio/low_mean": 2.5680752742118784e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7484400334287784e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15788.0,
+      "completions/mean_length": 6583.15625,
+      "completions/mean_terminated_length": 6427.587890625,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 1.0669879838824272,
+      "epoch": 0.37074517019319225,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023163470905274153,
+      "learning_rate": 1e-05,
+      "loss": 0.0332,
+      "num_tokens": 335249113.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2867175340652466,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999435544013977,
+      "sampling/importance_sampling_ratio/min": 0.0013276290846988559,
+      "sampling/sampling_logp_difference/max": 6.62436056137085,
+      "sampling/sampling_logp_difference/mean": 0.020729750394821167,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 1.915673669827811e-05,
+      "clip_ratio/high_mean": 4.789184174569527e-06,
+      "clip_ratio/low_mean": 4.268036605026282e-05,
+      "clip_ratio/low_min": 6.225874585652491e-06,
+      "clip_ratio/region_mean": 4.746955005430209e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15942.0,
+      "completions/mean_length": 7847.734375,
+      "completions/mean_terminated_length": 7712.23876953125,
+      "completions/min_length": 1127.0,
+      "completions/min_terminated_length": 1127.0,
+      "entropy": 1.0450394004583359,
+      "epoch": 0.3716651333946642,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0011931186309084296,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 336270823.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000035047531128,
+      "sampling/importance_sampling_ratio/min": 0.004087730310857296,
+      "sampling/sampling_logp_difference/max": 5.499765396118164,
+      "sampling/sampling_logp_difference/mean": 0.02191723883152008,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 7.73082024352334e-06,
+      "clip_ratio/high_mean": 1.932705060880835e-06,
+      "clip_ratio/low_mean": 2.2936642153581488e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4869347271305742e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15621.0,
+      "completions/mean_length": 6286.1953125,
+      "completions/mean_terminated_length": 6206.68505859375,
+      "completions/min_length": 918.0,
+      "completions/min_terminated_length": 918.0,
+      "entropy": 1.0122173130512238,
+      "epoch": 0.37258509659613614,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0032431832514703274,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 337095136.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24275578558444977,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999330639839172,
+      "sampling/importance_sampling_ratio/min": 2.1024358431986911e-07,
+      "sampling/sampling_logp_difference/max": 15.374999046325684,
+      "sampling/sampling_logp_difference/mean": 0.021477293223142624,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 9.451312507735565e-06,
+      "clip_ratio/high_mean": 2.3628281269338913e-06,
+      "clip_ratio/low_mean": 1.8447401316734613e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.081022921629483e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15792.0,
+      "completions/max_terminated_length": 15792.0,
+      "completions/mean_length": 7430.8125,
+      "completions/mean_terminated_length": 7430.8125,
+      "completions/min_length": 534.0,
+      "completions/min_terminated_length": 534.0,
+      "entropy": 1.1211064383387566,
+      "epoch": 0.3735050597976081,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012266195844858885,
+      "learning_rate": 1e-05,
+      "loss": 0.0132,
+      "num_tokens": 338069448.0,
+      "reward": 0.234375,
+      "reward_std": 0.17965975403785706,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999444484710693,
+      "sampling/importance_sampling_ratio/min": 0.0013370488304644823,
+      "sampling/sampling_logp_difference/max": 6.617290496826172,
+      "sampling/sampling_logp_difference/mean": 0.02237049862742424,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 1.1666743375826627e-05,
+      "clip_ratio/high_mean": 2.9166858439566568e-06,
+      "clip_ratio/low_mean": 3.927663362901512e-05,
+      "clip_ratio/low_min": 4.591199740389129e-06,
+      "clip_ratio/region_mean": 4.2193319245598104e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15672.0,
+      "completions/max_terminated_length": 15672.0,
+      "completions/mean_length": 6209.578125,
+      "completions/mean_terminated_length": 6209.578125,
+      "completions/min_length": 757.0,
+      "completions/min_terminated_length": 757.0,
+      "entropy": 0.9696918427944183,
+      "epoch": 0.37442502299908004,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002120936056599021,
+      "learning_rate": 1e-05,
+      "loss": 0.0554,
+      "num_tokens": 338883986.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999944806098938,
+      "sampling/importance_sampling_ratio/min": 0.000961031299084425,
+      "sampling/sampling_logp_difference/max": 6.947503566741943,
+      "sampling/sampling_logp_difference/mean": 0.0204964317381382,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 3.829187789960997e-06,
+      "clip_ratio/high_mean": 9.572969474902493e-07,
+      "clip_ratio/low_mean": 4.5606326921188156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.656362375499157e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15322.0,
+      "completions/max_terminated_length": 15322.0,
+      "completions/mean_length": 6625.140625,
+      "completions/mean_terminated_length": 6625.140625,
+      "completions/min_length": 1063.0,
+      "completions/min_terminated_length": 1063.0,
+      "entropy": 1.0780328214168549,
+      "epoch": 0.37534498620055196,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021016194950789213,
+      "learning_rate": 1e-05,
+      "loss": 0.0664,
+      "num_tokens": 339753228.0,
+      "reward": 0.359375,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 0.00479263998568058,
+      "sampling/sampling_logp_difference/max": 5.340673923492432,
+      "sampling/sampling_logp_difference/mean": 0.02143041603267193,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 1.7951345853362e-05,
+      "clip_ratio/high_mean": 4.4878364633405e-06,
+      "clip_ratio/low_mean": 3.357411151228007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8061947634560056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16335.0,
+      "completions/mean_length": 7494.2109375,
+      "completions/mean_terminated_length": 7207.443359375,
+      "completions/min_length": 62.0,
+      "completions/min_terminated_length": 62.0,
+      "entropy": 1.0134501904249191,
+      "epoch": 0.37626494940202393,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0017506639705970883,
+      "learning_rate": 1e-05,
+      "loss": 0.0361,
+      "num_tokens": 340731983.0,
+      "reward": 0.34375,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999791383743286,
+      "sampling/importance_sampling_ratio/min": 6.919008654904246e-08,
+      "sampling/sampling_logp_difference/max": 16.486408233642578,
+      "sampling/sampling_logp_difference/mean": 0.020142192021012306,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 2.0409703665791312e-05,
+      "clip_ratio/high_mean": 7.713539844189654e-06,
+      "clip_ratio/low_mean": 3.658559990071808e-05,
+      "clip_ratio/low_min": 3.80390133614128e-06,
+      "clip_ratio/region_mean": 4.429913997228141e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15238.0,
+      "completions/mean_length": 6724.828125,
+      "completions/mean_terminated_length": 6493.00830078125,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "entropy": 0.961749866604805,
+      "epoch": 0.37718491260349585,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014797865878790617,
+      "learning_rate": 1e-05,
+      "loss": -0.0195,
+      "num_tokens": 341613265.0,
+      "reward": 0.5,
+      "reward_std": 0.3145885467529297,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999843835830688,
+      "sampling/importance_sampling_ratio/min": 1.6481149941682816e-05,
+      "sampling/sampling_logp_difference/max": 11.013293266296387,
+      "sampling/sampling_logp_difference/mean": 0.021053435280919075,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 8.271860679087695e-06,
+      "clip_ratio/high_mean": 2.0679651697719237e-06,
+      "clip_ratio/low_mean": 2.1166565488783817e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.323453065855574e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14961.0,
+      "completions/mean_length": 6513.5625,
+      "completions/mean_terminated_length": 6195.1611328125,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.8742869198322296,
+      "epoch": 0.3781048758049678,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018223582301288843,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 342466337.0,
+      "reward": 0.5,
+      "reward_std": 0.20593318343162537,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690651893616,
+      "sampling/importance_sampling_ratio/min": 0.0027132700197398663,
+      "sampling/sampling_logp_difference/max": 5.909600734710693,
+      "sampling/sampling_logp_difference/mean": 0.01892159879207611,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 1.867416995082749e-05,
+      "clip_ratio/high_mean": 4.668542487706873e-06,
+      "clip_ratio/low_mean": 5.194308118916524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6611622540003737e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15859.0,
+      "completions/max_terminated_length": 15859.0,
+      "completions/mean_length": 7088.0390625,
+      "completions/mean_terminated_length": 7088.0390625,
+      "completions/min_length": 748.0,
+      "completions/min_terminated_length": 748.0,
+      "entropy": 0.8695354089140892,
+      "epoch": 0.37902483900643974,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00121080141980201,
+      "learning_rate": 1e-05,
+      "loss": 0.0095,
+      "num_tokens": 343393318.0,
+      "reward": 0.515625,
+      "reward_std": 0.3009189963340759,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.0003100235771853477,
+      "sampling/sampling_logp_difference/max": 8.078862190246582,
+      "sampling/sampling_logp_difference/mean": 0.01892455853521824,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 3.6179024164084694e-05,
+      "clip_ratio/high_mean": 9.044756041021174e-06,
+      "clip_ratio/low_mean": 3.288474886176118e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1929504845938936e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15178.0,
+      "completions/mean_length": 6221.6484375,
+      "completions/mean_terminated_length": 6141.6298828125,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.937163233757019,
+      "epoch": 0.37994480220791166,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002599990228191018,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 344207225.0,
+      "reward": 0.390625,
+      "reward_std": 0.348238468170166,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999527335166931,
+      "sampling/importance_sampling_ratio/min": 3.535756695782766e-05,
+      "sampling/sampling_logp_difference/max": 10.249998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019875720143318176,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.69036411534762e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.69036411534762e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 6664.46875,
+      "completions/mean_terminated_length": 6587.93701171875,
+      "completions/min_length": 1317.0,
+      "completions/min_terminated_length": 1317.0,
+      "entropy": 1.0893034785985947,
+      "epoch": 0.38086476540938363,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0012395181693136692,
+      "learning_rate": 1e-05,
+      "loss": 0.0358,
+      "num_tokens": 345082629.0,
+      "reward": 0.3984375,
+      "reward_std": 0.23145011067390442,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999253153800964,
+      "sampling/importance_sampling_ratio/min": 0.0004444181395228952,
+      "sampling/sampling_logp_difference/max": 7.71874475479126,
+      "sampling/sampling_logp_difference/mean": 0.022249475121498108,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 3.8116729683679296e-06,
+      "clip_ratio/high_mean": 9.529182420919824e-07,
+      "clip_ratio/low_mean": 1.930760379309504e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0260522319404117e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16020.0,
+      "completions/mean_length": 5986.390625,
+      "completions/mean_terminated_length": 5904.51953125,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9476369470357895,
+      "epoch": 0.38178472861085555,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011368105188012123,
+      "learning_rate": 1e-05,
+      "loss": 0.0414,
+      "num_tokens": 345869327.0,
+      "reward": 0.40625,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999204277992249,
+      "sampling/importance_sampling_ratio/min": 0.0007102401577867568,
+      "sampling/sampling_logp_difference/max": 7.249907493591309,
+      "sampling/sampling_logp_difference/mean": 0.019328134134411812,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 2.638578052938101e-06,
+      "clip_ratio/high_mean": 6.596445132345252e-07,
+      "clip_ratio/low_mean": 2.8019193905493012e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8678838418727537e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15247.0,
+      "completions/mean_length": 7780.8046875,
+      "completions/mean_terminated_length": 7574.328125,
+      "completions/min_length": 771.0,
+      "completions/min_terminated_length": 771.0,
+      "entropy": 0.9548748508095741,
+      "epoch": 0.3827046918123275,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016439391765743494,
+      "learning_rate": 1e-05,
+      "loss": 0.0134,
+      "num_tokens": 346885974.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999086856842041,
+      "sampling/importance_sampling_ratio/min": 0.0041214353404939175,
+      "sampling/sampling_logp_difference/max": 5.491553783416748,
+      "sampling/sampling_logp_difference/mean": 0.020669173449277878,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 8.280869224108756e-06,
+      "clip_ratio/high_mean": 2.070217306027189e-06,
+      "clip_ratio/low_mean": 3.338867099955678e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5458888532957644e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15766.0,
+      "completions/mean_length": 7118.4921875,
+      "completions/mean_terminated_length": 6582.470703125,
+      "completions/min_length": 833.0,
+      "completions/min_terminated_length": 833.0,
+      "entropy": 0.9908356294035912,
+      "epoch": 0.38362465501379944,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002354196272790432,
+      "learning_rate": 1e-05,
+      "loss": 0.037,
+      "num_tokens": 347818245.0,
+      "reward": 0.421875,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998934268951416,
+      "sampling/importance_sampling_ratio/min": 7.691462087677792e-05,
+      "sampling/sampling_logp_difference/max": 9.472814559936523,
+      "sampling/sampling_logp_difference/mean": 0.020420750603079796,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 4.261557478457689e-06,
+      "clip_ratio/high_mean": 1.0653893696144223e-06,
+      "clip_ratio/low_mean": 3.0260198514042713e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1325587883657136e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15213.0,
+      "completions/mean_length": 7016.0546875,
+      "completions/mean_terminated_length": 6791.22412109375,
+      "completions/min_length": 907.0,
+      "completions/min_terminated_length": 907.0,
+      "entropy": 0.9372202381491661,
+      "epoch": 0.3845446182152714,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002695834031328559,
+      "learning_rate": 1e-05,
+      "loss": 0.0356,
+      "num_tokens": 348734852.0,
+      "reward": 0.484375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999836087226868,
+      "sampling/importance_sampling_ratio/min": 3.6898933331031003e-07,
+      "sampling/sampling_logp_difference/max": 14.812498092651367,
+      "sampling/sampling_logp_difference/mean": 0.01997985690832138,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 1.4203505088516977e-05,
+      "clip_ratio/high_mean": 4.557706688501639e-06,
+      "clip_ratio/low_mean": 3.802522951446008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.258293620296172e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15005.0,
+      "completions/max_terminated_length": 15005.0,
+      "completions/mean_length": 6170.859375,
+      "completions/mean_terminated_length": 6170.859375,
+      "completions/min_length": 894.0,
+      "completions/min_terminated_length": 894.0,
+      "entropy": 0.7692223712801933,
+      "epoch": 0.38546458141674333,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003598283976316452,
+      "learning_rate": 1e-05,
+      "loss": 0.0745,
+      "num_tokens": 349543850.0,
+      "reward": 0.625,
+      "reward_std": 0.37875327467918396,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999639987945557,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.017690379172563553,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 3.7454306038853247e-06,
+      "clip_ratio/high_mean": 9.363576509713312e-07,
+      "clip_ratio/low_mean": 2.0118780639677425e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1055138290648756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14385.0,
+      "completions/mean_length": 6198.5859375,
+      "completions/mean_terminated_length": 6118.3857421875,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "entropy": 1.0641538202762604,
+      "epoch": 0.38638454461821525,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003362868446856737,
+      "learning_rate": 1e-05,
+      "loss": 0.0385,
+      "num_tokens": 350358493.0,
+      "reward": 0.4375,
+      "reward_std": 0.2432974874973297,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000051498413086,
+      "sampling/importance_sampling_ratio/min": 9.425564826415211e-07,
+      "sampling/sampling_logp_difference/max": 13.874670028686523,
+      "sampling/sampling_logp_difference/mean": 0.01945672184228897,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.529027955868514e-05,
+      "clip_ratio/low_min": 1.1817648100986844e-05,
+      "clip_ratio/region_mean": 4.529027955868514e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 7429.953125,
+      "completions/mean_terminated_length": 6833.01708984375,
+      "completions/min_length": 1152.0,
+      "completions/min_terminated_length": 1152.0,
+      "entropy": 0.7885174229741096,
+      "epoch": 0.3873045078196872,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0020358162000775337,
+      "learning_rate": 1e-05,
+      "loss": 0.0665,
+      "num_tokens": 351327135.0,
+      "reward": 0.3984375,
+      "reward_std": 0.31800347566604614,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483823776245,
+      "sampling/importance_sampling_ratio/min": 4.07999541494064e-05,
+      "sampling/sampling_logp_difference/max": 10.106829643249512,
+      "sampling/sampling_logp_difference/mean": 0.017557526007294655,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 1.2953943951288238e-05,
+      "clip_ratio/high_mean": 4.294050768294255e-06,
+      "clip_ratio/low_mean": 2.7448330115475983e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174238065639656e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16185.0,
+      "completions/max_terminated_length": 16185.0,
+      "completions/mean_length": 7466.75,
+      "completions/mean_terminated_length": 7466.75,
+      "completions/min_length": 311.0,
+      "completions/min_terminated_length": 311.0,
+      "entropy": 0.9798530638217926,
+      "epoch": 0.38822447102115915,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019255588995292783,
+      "learning_rate": 1e-05,
+      "loss": 0.0395,
+      "num_tokens": 352300247.0,
+      "reward": 0.265625,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999645352363586,
+      "sampling/importance_sampling_ratio/min": 0.0010790677042677999,
+      "sampling/sampling_logp_difference/max": 6.831657886505127,
+      "sampling/sampling_logp_difference/mean": 0.020764775574207306,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 1.4318582771011279e-05,
+      "clip_ratio/high_mean": 3.5796456927528197e-06,
+      "clip_ratio/low_mean": 1.4836090599601448e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8415736349197687e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16182.0,
+      "completions/mean_length": 6264.40625,
+      "completions/mean_terminated_length": 6021.5361328125,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "entropy": 0.8464985340833664,
+      "epoch": 0.3891444342226311,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016904048388823867,
+      "learning_rate": 1e-05,
+      "loss": 0.0434,
+      "num_tokens": 353122747.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2738093435764313,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999396800994873,
+      "sampling/importance_sampling_ratio/min": 1.569278902024962e-05,
+      "sampling/sampling_logp_difference/max": 11.062309265136719,
+      "sampling/sampling_logp_difference/mean": 0.018584076315164566,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 1.6524649709026562e-05,
+      "clip_ratio/high_mean": 5.198334406486538e-06,
+      "clip_ratio/low_mean": 5.1570618779805955e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.676895318629249e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16051.0,
+      "completions/max_terminated_length": 16051.0,
+      "completions/mean_length": 5848.3359375,
+      "completions/mean_terminated_length": 5848.3359375,
+      "completions/min_length": 705.0,
+      "completions/min_terminated_length": 705.0,
+      "entropy": 1.0793062299489975,
+      "epoch": 0.39006439742410304,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015838779509067535,
+      "learning_rate": 1e-05,
+      "loss": -0.0144,
+      "num_tokens": 353888374.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999014139175415,
+      "sampling/importance_sampling_ratio/min": 0.0002261155314045027,
+      "sampling/sampling_logp_difference/max": 8.394464492797852,
+      "sampling/sampling_logp_difference/mean": 0.020625369623303413,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 2.2546613308804808e-05,
+      "clip_ratio/high_mean": 5.636653327201202e-06,
+      "clip_ratio/low_mean": 4.848485787078971e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.048513922796701e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14583.0,
+      "completions/mean_length": 5917.984375,
+      "completions/mean_terminated_length": 5751.857421875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.8621423915028572,
+      "epoch": 0.39098436062557496,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.002542395843192935,
+      "learning_rate": 1e-05,
+      "loss": 0.053,
+      "num_tokens": 354665052.0,
+      "reward": 0.6484375,
+      "reward_std": 0.13941732048988342,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999158382415771,
+      "sampling/importance_sampling_ratio/min": 0.00038012932054698467,
+      "sampling/sampling_logp_difference/max": 7.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.0170799158513546,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 1.1686064681271091e-05,
+      "clip_ratio/high_mean": 2.9215161703177728e-06,
+      "clip_ratio/low_mean": 1.6330765674865688e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9252282072557136e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 6513.65625,
+      "completions/mean_terminated_length": 6435.93701171875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0047430396080017,
+      "epoch": 0.39190432382704693,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0008743361104279757,
+      "learning_rate": 1e-05,
+      "loss": 0.0568,
+      "num_tokens": 355526744.0,
+      "reward": 0.3125,
+      "reward_std": 0.16097761690616608,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999683499336243,
+      "sampling/importance_sampling_ratio/min": 5.006812898500357e-06,
+      "sampling/sampling_logp_difference/max": 12.204710960388184,
+      "sampling/sampling_logp_difference/mean": 0.020237455144524574,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 1.7667963220446836e-05,
+      "clip_ratio/high_mean": 4.416990805111709e-06,
+      "clip_ratio/low_mean": 2.390649478911655e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.832348559422826e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13700.0,
+      "completions/max_terminated_length": 13700.0,
+      "completions/mean_length": 6363.9375,
+      "completions/mean_terminated_length": 6363.9375,
+      "completions/min_length": 1118.0,
+      "completions/min_terminated_length": 1118.0,
+      "entropy": 0.910186342895031,
+      "epoch": 0.39282428702851885,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034290661569684744,
+      "learning_rate": 1e-05,
+      "loss": 0.0773,
+      "num_tokens": 356359920.0,
+      "reward": 0.4296875,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 0.0023352939169853926,
+      "sampling/sampling_logp_difference/max": 6.059617519378662,
+      "sampling/sampling_logp_difference/mean": 0.019128751009702682,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 1.9295963738841238e-05,
+      "clip_ratio/high_mean": 4.823990934710309e-06,
+      "clip_ratio/low_mean": 3.187764491485723e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.67016357358807e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14673.0,
+      "completions/max_terminated_length": 14673.0,
+      "completions/mean_length": 6206.5859375,
+      "completions/mean_terminated_length": 6206.5859375,
+      "completions/min_length": 988.0,
+      "completions/min_terminated_length": 988.0,
+      "entropy": 0.8695667088031769,
+      "epoch": 0.3937442502299908,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022478618193417788,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 357172435.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3332657814025879,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000020146369934,
+      "sampling/importance_sampling_ratio/min": 1.993246769416146e-06,
+      "sampling/sampling_logp_difference/max": 13.12574577331543,
+      "sampling/sampling_logp_difference/mean": 0.019101407378911972,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 2.577107125034672e-06,
+      "clip_ratio/high_mean": 6.44276781258668e-07,
+      "clip_ratio/low_mean": 3.719566507243144e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.783994179684669e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14648.0,
+      "completions/mean_length": 6773.65625,
+      "completions/mean_terminated_length": 6697.984375,
+      "completions/min_length": 1150.0,
+      "completions/min_terminated_length": 1150.0,
+      "entropy": 1.0704292133450508,
+      "epoch": 0.39466421343146274,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030995130073279142,
+      "learning_rate": 1e-05,
+      "loss": 0.0409,
+      "num_tokens": 358060623.0,
+      "reward": 0.3515625,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589323997498,
+      "sampling/importance_sampling_ratio/min": 1.8965129129355773e-05,
+      "sampling/sampling_logp_difference/max": 10.872908592224121,
+      "sampling/sampling_logp_difference/mean": 0.02080383338034153,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 1.0044732334790751e-05,
+      "clip_ratio/high_mean": 3.6204799016559264e-06,
+      "clip_ratio/low_mean": 3.683777390506293e-05,
+      "clip_ratio/low_min": 4.640285169443814e-06,
+      "clip_ratio/region_mean": 4.045825380671886e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 6753.4609375,
+      "completions/mean_terminated_length": 6442.79833984375,
+      "completions/min_length": 901.0,
+      "completions/min_terminated_length": 901.0,
+      "entropy": 0.8907509669661522,
+      "epoch": 0.39558417663293466,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0025211002212017775,
+      "learning_rate": 1e-05,
+      "loss": 0.0812,
+      "num_tokens": 358942514.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33691808581352234,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513030052185,
+      "sampling/importance_sampling_ratio/min": 0.001427572569809854,
+      "sampling/sampling_logp_difference/max": 6.551779747009277,
+      "sampling/sampling_logp_difference/mean": 0.019076799973845482,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 2.213625748481718e-05,
+      "clip_ratio/high_mean": 5.534064371204295e-06,
+      "clip_ratio/low_mean": 4.042425916850334e-05,
+      "clip_ratio/low_min": 4.858519787376281e-06,
+      "clip_ratio/region_mean": 4.59583234260208e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16165.0,
+      "completions/max_terminated_length": 16165.0,
+      "completions/mean_length": 5878.4921875,
+      "completions/mean_terminated_length": 5878.4921875,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "entropy": 0.8234230354428291,
+      "epoch": 0.39650413983440663,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023358019534498453,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 359716041.0,
+      "reward": 0.53125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998354911804199,
+      "sampling/importance_sampling_ratio/min": 0.0008571944781579077,
+      "sampling/sampling_logp_difference/max": 7.061845779418945,
+      "sampling/sampling_logp_difference/mean": 0.018851958215236664,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 7.793237045916612e-06,
+      "clip_ratio/high_mean": 1.948309261479153e-06,
+      "clip_ratio/low_mean": 5.3089813718543155e-05,
+      "clip_ratio/low_min": 3.7982376852596644e-06,
+      "clip_ratio/region_mean": 5.503812303686573e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15028.0,
+      "completions/mean_length": 6296.0078125,
+      "completions/mean_terminated_length": 6135.88134765625,
+      "completions/min_length": 1187.0,
+      "completions/min_terminated_length": 1187.0,
+      "entropy": 0.9341304004192352,
+      "epoch": 0.39742410303587855,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002632992109283805,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 360544066.0,
+      "reward": 0.390625,
+      "reward_std": 0.30433881282806396,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999693036079407,
+      "sampling/importance_sampling_ratio/min": 0.00015875507961027324,
+      "sampling/sampling_logp_difference/max": 8.748147964477539,
+      "sampling/sampling_logp_difference/mean": 0.01882069557905197,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 1.8652748622116633e-05,
+      "clip_ratio/high_mean": 4.663187155529158e-06,
+      "clip_ratio/low_mean": 3.725770324081168e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1920890453184256e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15766.0,
+      "completions/mean_length": 7325.359375,
+      "completions/mean_terminated_length": 6957.12158203125,
+      "completions/min_length": 945.0,
+      "completions/min_terminated_length": 945.0,
+      "entropy": 0.7979409247636795,
+      "epoch": 0.3983440662373505,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002110559493303299,
+      "learning_rate": 1e-05,
+      "loss": 0.0474,
+      "num_tokens": 361502504.0,
+      "reward": 0.4921875,
+      "reward_std": 0.21436071395874023,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999271631240845,
+      "sampling/importance_sampling_ratio/min": 1.778415753506124e-05,
+      "sampling/sampling_logp_difference/max": 10.937202453613281,
+      "sampling/sampling_logp_difference/mean": 0.018452363088726997,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 5.034029982198263e-06,
+      "clip_ratio/high_mean": 1.2585074955495656e-06,
+      "clip_ratio/low_mean": 2.1098365436955646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2356872932505212e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 5471.5625,
+      "completions/mean_terminated_length": 5385.6376953125,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "entropy": 0.8691592514514923,
+      "epoch": 0.39926402943882244,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038794223219156265,
+      "learning_rate": 1e-05,
+      "loss": -0.041,
+      "num_tokens": 362220856.0,
+      "reward": 0.546875,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.0027285523246973753,
+      "sampling/sampling_logp_difference/max": 5.903984069824219,
+      "sampling/sampling_logp_difference/mean": 0.01814887300133705,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 1.2709096154139843e-05,
+      "clip_ratio/high_mean": 3.1772740385349607e-06,
+      "clip_ratio/low_mean": 4.124845816022571e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.442573271035144e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 5305.328125,
+      "completions/mean_terminated_length": 5218.09423828125,
+      "completions/min_length": 542.0,
+      "completions/min_terminated_length": 542.0,
+      "entropy": 0.7804318591952324,
+      "epoch": 0.40018399264029436,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029884849209338427,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 362921226.0,
+      "reward": 0.6328125,
+      "reward_std": 0.3505876660346985,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999871015548706,
+      "sampling/importance_sampling_ratio/min": 0.0024799995590001345,
+      "sampling/sampling_logp_difference/max": 5.999496936798096,
+      "sampling/sampling_logp_difference/mean": 0.017358118668198586,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 4.018904746772023e-06,
+      "clip_ratio/high_mean": 1.9869055449817097e-06,
+      "clip_ratio/low_mean": 3.535901299756006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.734591876991544e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15577.0,
+      "completions/max_terminated_length": 15577.0,
+      "completions/mean_length": 7197.6328125,
+      "completions/mean_terminated_length": 7197.6328125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9466754496097565,
+      "epoch": 0.40110395584176634,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0023567057214677334,
+      "learning_rate": 1e-05,
+      "loss": 0.1036,
+      "num_tokens": 363863579.0,
+      "reward": 0.375,
+      "reward_std": 0.2924865484237671,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 3.132333574740187e-07,
+      "sampling/sampling_logp_difference/max": 14.976317405700684,
+      "sampling/sampling_logp_difference/mean": 0.020331334322690964,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 3.7869606330787065e-06,
+      "clip_ratio/high_mean": 9.467401582696766e-07,
+      "clip_ratio/low_mean": 4.479868130147224e-05,
+      "clip_ratio/low_min": 5.061343472334556e-06,
+      "clip_ratio/region_mean": 4.57454214028985e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15503.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6058.7890625,
+      "completions/mean_terminated_length": 6058.7890625,
+      "completions/min_length": 732.0,
+      "completions/min_terminated_length": 732.0,
+      "entropy": 0.9345398098230362,
+      "epoch": 0.40202391904323825,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018098369473591447,
+      "learning_rate": 1e-05,
+      "loss": 0.1307,
+      "num_tokens": 364660120.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293684959412,
+      "sampling/importance_sampling_ratio/min": 0.004112724680453539,
+      "sampling/sampling_logp_difference/max": 5.493669509887695,
+      "sampling/sampling_logp_difference/mean": 0.019891154021024704,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 1.2886742979389965e-05,
+      "clip_ratio/high_mean": 3.221685744847491e-06,
+      "clip_ratio/low_mean": 4.962291495758109e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.284460121401935e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16003.0,
+      "completions/mean_length": 6929.984375,
+      "completions/mean_terminated_length": 6625.01611328125,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9930986166000366,
+      "epoch": 0.4029438822447102,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033301038201898336,
+      "learning_rate": 1e-05,
+      "loss": 0.0313,
+      "num_tokens": 365564662.0,
+      "reward": 0.3828125,
+      "reward_std": 0.30457618832588196,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.0009120093891397119,
+      "sampling/sampling_logp_difference/max": 6.9998602867126465,
+      "sampling/sampling_logp_difference/mean": 0.02060488425195217,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 1.3284722399475868e-05,
+      "clip_ratio/high_mean": 3.321180599868967e-06,
+      "clip_ratio/low_mean": 2.590538883850968e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.922656926784839e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14903.0,
+      "completions/max_terminated_length": 14903.0,
+      "completions/mean_length": 6197.3671875,
+      "completions/mean_terminated_length": 6197.3671875,
+      "completions/min_length": 845.0,
+      "completions/min_terminated_length": 845.0,
+      "entropy": 0.9469878897070885,
+      "epoch": 0.40386384544618215,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003049476072192192,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 366379725.0,
+      "reward": 0.421875,
+      "reward_std": 0.3253750801086426,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999247789382935,
+      "sampling/importance_sampling_ratio/min": 0.0005533178336918354,
+      "sampling/sampling_logp_difference/max": 7.49957799911499,
+      "sampling/sampling_logp_difference/mean": 0.019666746258735657,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 1.4212190535545233e-05,
+      "clip_ratio/high_mean": 3.553047633886308e-06,
+      "clip_ratio/low_mean": 4.362488289189059e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7177931264741346e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15647.0,
+      "completions/mean_length": 6331.6015625,
+      "completions/mean_terminated_length": 6007.33056640625,
+      "completions/min_length": 600.0,
+      "completions/min_terminated_length": 600.0,
+      "entropy": 0.9937634319067001,
+      "epoch": 0.4047838086476541,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001529635745100677,
+      "learning_rate": 1e-05,
+      "loss": 0.0863,
+      "num_tokens": 367207994.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2732901871204376,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998211860656738,
+      "sampling/importance_sampling_ratio/min": 0.0013787593925371766,
+      "sampling/sampling_logp_difference/max": 6.586571216583252,
+      "sampling/sampling_logp_difference/mean": 0.02042214572429657,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 1.3438677797239507e-05,
+      "clip_ratio/high_mean": 4.353689405434125e-06,
+      "clip_ratio/low_mean": 2.1308957457222277e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5662646748969564e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14323.0,
+      "completions/mean_length": 6679.5,
+      "completions/mean_terminated_length": 6525.4609375,
+      "completions/min_length": 894.0,
+      "completions/min_terminated_length": 894.0,
+      "entropy": 1.034226231276989,
+      "epoch": 0.40570377184912604,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002576075494289398,
+      "learning_rate": 1e-05,
+      "loss": 0.0037,
+      "num_tokens": 368085602.0,
+      "reward": 0.4921875,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999170899391174,
+      "sampling/importance_sampling_ratio/min": 0.02749871462583542,
+      "sampling/sampling_logp_difference/max": 3.593616008758545,
+      "sampling/sampling_logp_difference/mean": 0.02129797264933586,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 1.2707126188615803e-05,
+      "clip_ratio/high_mean": 3.1767815471539507e-06,
+      "clip_ratio/low_mean": 5.362682486520498e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6803606184985256e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14425.0,
+      "completions/mean_length": 7171.984375,
+      "completions/mean_terminated_length": 6874.822265625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "entropy": 0.994599312543869,
+      "epoch": 0.40662373505059796,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003648000070825219,
+      "learning_rate": 1e-05,
+      "loss": 0.0468,
+      "num_tokens": 369021400.0,
+      "reward": 0.34375,
+      "reward_std": 0.3174794614315033,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999703764915466,
+      "sampling/importance_sampling_ratio/min": 2.1446328901220113e-05,
+      "sampling/sampling_logp_difference/max": 10.749957084655762,
+      "sampling/sampling_logp_difference/mean": 0.02128203772008419,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 4.010523753095185e-06,
+      "clip_ratio/high_mean": 1.0026309382737963e-06,
+      "clip_ratio/low_mean": 5.049121273259516e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.149384355718212e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15812.0,
+      "completions/mean_length": 7633.953125,
+      "completions/mean_terminated_length": 7203.62255859375,
+      "completions/min_length": 746.0,
+      "completions/min_terminated_length": 746.0,
+      "entropy": 0.9781397357583046,
+      "epoch": 0.40754369825206993,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002637698082253337,
+      "learning_rate": 1e-05,
+      "loss": 0.1255,
+      "num_tokens": 370022274.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3106446862220764,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999527931213379,
+      "sampling/importance_sampling_ratio/min": 0.0006269909208640456,
+      "sampling/sampling_logp_difference/max": 7.374578475952148,
+      "sampling/sampling_logp_difference/mean": 0.02037280797958374,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 8.796280781098176e-06,
+      "clip_ratio/high_mean": 2.199070195274544e-06,
+      "clip_ratio/low_mean": 2.404907445452409e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6248144422424957e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14346.0,
+      "completions/mean_length": 6966.890625,
+      "completions/mean_terminated_length": 6892.740234375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 1.0748675763607025,
+      "epoch": 0.40846366145354185,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002537182765081525,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 370936076.0,
+      "reward": 0.421875,
+      "reward_std": 0.24329747259616852,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999483823776245,
+      "sampling/importance_sampling_ratio/min": 0.001600474352017045,
+      "sampling/sampling_logp_difference/max": 6.437455177307129,
+      "sampling/sampling_logp_difference/mean": 0.0208933986723423,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 1.888703832264582e-05,
+      "clip_ratio/high_mean": 4.721759580661455e-06,
+      "clip_ratio/low_mean": 3.932560184694012e-05,
+      "clip_ratio/low_min": 3.3643752885836875e-06,
+      "clip_ratio/region_mean": 4.404736250762653e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16226.0,
+      "completions/mean_length": 7487.2890625,
+      "completions/mean_terminated_length": 7346.07177734375,
+      "completions/min_length": 792.0,
+      "completions/min_terminated_length": 792.0,
+      "entropy": 0.9402988106012344,
+      "epoch": 0.4093836246550138,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0016896538436412811,
+      "learning_rate": 1e-05,
+      "loss": 0.0569,
+      "num_tokens": 371915793.0,
+      "reward": 0.3125,
+      "reward_std": 0.32849061489105225,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999174475669861,
+      "sampling/importance_sampling_ratio/min": 4.222911684337305e-06,
+      "sampling/sampling_logp_difference/max": 12.374985694885254,
+      "sampling/sampling_logp_difference/mean": 0.018897607922554016,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 1.2214306025271071e-05,
+      "clip_ratio/high_mean": 3.0535765063177678e-06,
+      "clip_ratio/low_mean": 1.0073189514514524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.3126766020832292e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14307.0,
+      "completions/max_terminated_length": 14307.0,
+      "completions/mean_length": 5188.9375,
+      "completions/mean_terminated_length": 5188.9375,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "entropy": 0.8868530839681625,
+      "epoch": 0.41030358785648574,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001575644128024578,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 372605969.0,
+      "reward": 0.5390625,
+      "reward_std": 0.1938612163066864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999008774757385,
+      "sampling/importance_sampling_ratio/min": 0.0020112686324864626,
+      "sampling/sampling_logp_difference/max": 6.20898962020874,
+      "sampling/sampling_logp_difference/mean": 0.017719607800245285,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 1.6542175217182375e-05,
+      "clip_ratio/high_mean": 6.5401112578911125e-06,
+      "clip_ratio/low_mean": 3.020691053734481e-05,
+      "clip_ratio/low_min": 4.941101906297263e-06,
+      "clip_ratio/region_mean": 3.674702134048857e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 7290.9140625,
+      "completions/mean_terminated_length": 7146.57958984375,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "entropy": 1.06352149695158,
+      "epoch": 0.41122355105795766,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020332508720457554,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 373557094.0,
+      "reward": 0.40625,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998309016227722,
+      "sampling/importance_sampling_ratio/min": 8.97010977496393e-06,
+      "sampling/sampling_logp_difference/max": 11.621612548828125,
+      "sampling/sampling_logp_difference/mean": 0.022010326385498047,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 8.10710616860888e-06,
+      "clip_ratio/high_mean": 2.02677654215222e-06,
+      "clip_ratio/low_mean": 5.330761632649228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.5334393664452364e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15476.0,
+      "completions/mean_length": 6881.6640625,
+      "completions/mean_terminated_length": 6495.39013671875,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 0.9094375595450401,
+      "epoch": 0.41214351425942963,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0019624519627541304,
+      "learning_rate": 1e-05,
+      "loss": 0.0492,
+      "num_tokens": 374459827.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999911785125732,
+      "sampling/importance_sampling_ratio/min": 3.292101524721147e-08,
+      "sampling/sampling_logp_difference/max": 17.229154586791992,
+      "sampling/sampling_logp_difference/mean": 0.019491354003548622,
+      "step": 448
+    },
+    {
+      "clip_ratio/high_max": 2.0297283754189266e-05,
+      "clip_ratio/high_mean": 5.0743209385473165e-06,
+      "clip_ratio/low_mean": 3.7426975950438646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.250129745742015e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14792.0,
+      "completions/mean_length": 6641.203125,
+      "completions/mean_terminated_length": 6245.154296875,
+      "completions/min_length": 925.0,
+      "completions/min_terminated_length": 925.0,
+      "entropy": 0.7556380406022072,
+      "epoch": 0.41306347746090155,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0010716031538322568,
+      "learning_rate": 1e-05,
+      "loss": 0.1355,
+      "num_tokens": 375331749.0,
+      "reward": 0.625,
+      "reward_std": 0.34876543283462524,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000039339065552,
+      "sampling/importance_sampling_ratio/min": 0.00010258897236781195,
+      "sampling/sampling_logp_difference/max": 9.18478012084961,
+      "sampling/sampling_logp_difference/mean": 0.017056716606020927,
+      "step": 449
+    },
+    {
+      "clip_ratio/high_max": 2.1341018509701826e-05,
+      "clip_ratio/high_mean": 5.335254627425456e-06,
+      "clip_ratio/low_mean": 4.72563451694441e-05,
+      "clip_ratio/low_min": 6.4834025579330046e-06,
+      "clip_ratio/region_mean": 5.259159979686956e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15757.0,
+      "completions/max_terminated_length": 15757.0,
+      "completions/mean_length": 6514.875,
+      "completions/mean_terminated_length": 6514.875,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.9535354822874069,
+      "epoch": 0.4139834406623735,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025929149705916643,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 376183309.0,
+      "reward": 0.421875,
+      "reward_std": 0.28277361392974854,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998977184295654,
+      "sampling/importance_sampling_ratio/min": 0.002191081177443266,
+      "sampling/sampling_logp_difference/max": 6.1233601570129395,
+      "sampling/sampling_logp_difference/mean": 0.019740387797355652,
+      "step": 450
+    },
+    {
+      "clip_ratio/high_max": 1.2529956165963085e-05,
+      "clip_ratio/high_mean": 4.370210831439181e-06,
+      "clip_ratio/low_mean": 6.38160736343707e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.075181819487625e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15798.0,
+      "completions/mean_length": 6045.640625,
+      "completions/mean_terminated_length": 5964.236328125,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 1.0733412355184555,
+      "epoch": 0.41490340386384544,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023648168426007032,
+      "learning_rate": 1e-05,
+      "loss": 0.005,
+      "num_tokens": 376978175.0,
+      "reward": 0.421875,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999704360961914,
+      "sampling/importance_sampling_ratio/min": 0.0001392154226778075,
+      "sampling/sampling_logp_difference/max": 8.879487991333008,
+      "sampling/sampling_logp_difference/mean": 0.020569145679473877,
+      "step": 451
+    },
+    {
+      "clip_ratio/high_max": 4.286840976419626e-06,
+      "clip_ratio/high_mean": 1.0717102441049065e-06,
+      "clip_ratio/low_mean": 2.4207001501963532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5278711859755276e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7767.7578125,
+      "completions/mean_terminated_length": 7489.814453125,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "entropy": 1.0381295159459114,
+      "epoch": 0.41582336706531736,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0015338027151301503,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "num_tokens": 377994592.0,
+      "reward": 0.4140625,
+      "reward_std": 0.14230038225650787,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999648332595825,
+      "sampling/importance_sampling_ratio/min": 8.825274733226252e-08,
+      "sampling/sampling_logp_difference/max": 16.243061065673828,
+      "sampling/sampling_logp_difference/mean": 0.02027149498462677,
+      "step": 452
+    },
+    {
+      "clip_ratio/high_max": 7.272515631484566e-06,
+      "clip_ratio/high_mean": 1.8181289078711416e-06,
+      "clip_ratio/low_mean": 2.767900923572597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949713825728395e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15264.0,
+      "completions/max_terminated_length": 15264.0,
+      "completions/mean_length": 7002.21875,
+      "completions/mean_terminated_length": 7002.21875,
+      "completions/min_length": 703.0,
+      "completions/min_terminated_length": 703.0,
+      "entropy": 1.0032588243484497,
+      "epoch": 0.41674333026678934,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002184878336265683,
+      "learning_rate": 1e-05,
+      "loss": 0.0439,
+      "num_tokens": 378909468.0,
+      "reward": 0.4453125,
+      "reward_std": 0.17859894037246704,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999321103096008,
+      "sampling/importance_sampling_ratio/min": 1.3040186786383856e-05,
+      "sampling/sampling_logp_difference/max": 11.247474670410156,
+      "sampling/sampling_logp_difference/mean": 0.02025642991065979,
+      "step": 453
+    },
+    {
+      "clip_ratio/high_max": 4.38227471022401e-06,
+      "clip_ratio/high_mean": 1.0955686775560025e-06,
+      "clip_ratio/low_mean": 2.8486808901106997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9582377578663e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 8433.3984375,
+      "completions/mean_terminated_length": 8042.384765625,
+      "completions/min_length": 1429.0,
+      "completions/min_terminated_length": 1429.0,
+      "entropy": 0.9339399412274361,
+      "epoch": 0.41766329346826125,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0015065330080688,
+      "learning_rate": 1e-05,
+      "loss": 0.0026,
+      "num_tokens": 380009687.0,
+      "reward": 0.3359375,
+      "reward_std": 0.17358636856079102,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999430179595947,
+      "sampling/importance_sampling_ratio/min": 0.0004234187363181263,
+      "sampling/sampling_logp_difference/max": 7.767148971557617,
+      "sampling/sampling_logp_difference/mean": 0.020081156864762306,
+      "step": 454
+    },
+    {
+      "clip_ratio/high_max": 1.8815874227584573e-05,
+      "clip_ratio/high_mean": 4.703968556896143e-06,
+      "clip_ratio/low_mean": 2.8154490735232685e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.285845917844199e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15186.0,
+      "completions/max_terminated_length": 15186.0,
+      "completions/mean_length": 7050.3203125,
+      "completions/mean_terminated_length": 7050.3203125,
+      "completions/min_length": 873.0,
+      "completions/min_terminated_length": 873.0,
+      "entropy": 0.9537717178463936,
+      "epoch": 0.41858325666973323,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0013606940628960729,
+      "learning_rate": 1e-05,
+      "loss": 0.0125,
+      "num_tokens": 380930480.0,
+      "reward": 0.578125,
+      "reward_std": 0.28407180309295654,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999956488609314,
+      "sampling/importance_sampling_ratio/min": 0.00011017238284694031,
+      "sampling/sampling_logp_difference/max": 9.11346435546875,
+      "sampling/sampling_logp_difference/mean": 0.020253805443644524,
+      "step": 455
+    },
+    {
+      "clip_ratio/high_max": 4.247366632625926e-06,
+      "clip_ratio/high_mean": 1.0618416581564816e-06,
+      "clip_ratio/low_mean": 2.397758157712815e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5039423462658306e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15698.0,
+      "completions/max_terminated_length": 15698.0,
+      "completions/mean_length": 6561.1640625,
+      "completions/mean_terminated_length": 6561.1640625,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.9863667339086533,
+      "epoch": 0.41950321987120515,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017187768826261163,
+      "learning_rate": 1e-05,
+      "loss": 0.0332,
+      "num_tokens": 381790981.0,
+      "reward": 0.4375,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998313188552856,
+      "sampling/importance_sampling_ratio/min": 0.010767512023448944,
+      "sampling/sampling_logp_difference/max": 4.531221866607666,
+      "sampling/sampling_logp_difference/mean": 0.02073034644126892,
+      "step": 456
+    },
+    {
+      "clip_ratio/high_max": 2.9292289127624827e-05,
+      "clip_ratio/high_mean": 8.657401849632151e-06,
+      "clip_ratio/low_mean": 4.3774077425950964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2431478707148926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15705.0,
+      "completions/mean_length": 7120.1875,
+      "completions/mean_terminated_length": 6973.14306640625,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.9760185852646828,
+      "epoch": 0.4204231830726771,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016811270033940673,
+      "learning_rate": 1e-05,
+      "loss": 0.0804,
+      "num_tokens": 382722173.0,
+      "reward": 0.421875,
+      "reward_std": 0.27670514583587646,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999004602432251,
+      "sampling/importance_sampling_ratio/min": 0.0008047398878261447,
+      "sampling/sampling_logp_difference/max": 7.124991416931152,
+      "sampling/sampling_logp_difference/mean": 0.02018534392118454,
+      "step": 457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.610178137274488e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.610178137274488e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 7057.1015625,
+      "completions/mean_terminated_length": 6833.25634765625,
+      "completions/min_length": 922.0,
+      "completions/min_terminated_length": 922.0,
+      "entropy": 0.948130652308464,
+      "epoch": 0.42134314627414904,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015492907259613276,
+      "learning_rate": 1e-05,
+      "loss": 0.0319,
+      "num_tokens": 383650426.0,
+      "reward": 0.421875,
+      "reward_std": 0.21040895581245422,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999640583992004,
+      "sampling/importance_sampling_ratio/min": 0.003965416923165321,
+      "sampling/sampling_logp_difference/max": 5.530144214630127,
+      "sampling/sampling_logp_difference/mean": 0.02065262943506241,
+      "step": 458
+    },
+    {
+      "clip_ratio/high_max": 8.952108146331739e-06,
+      "clip_ratio/high_mean": 2.2380270365829347e-06,
+      "clip_ratio/low_mean": 2.777617066840321e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.001419747761247e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 7001.7578125,
+      "completions/mean_terminated_length": 6852.83349609375,
+      "completions/min_length": 1065.0,
+      "completions/min_terminated_length": 1065.0,
+      "entropy": 0.9631693065166473,
+      "epoch": 0.42226310947562096,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0013419219758361578,
+      "learning_rate": 1e-05,
+      "loss": 0.0705,
+      "num_tokens": 384565995.0,
+      "reward": 0.390625,
+      "reward_std": 0.18701860308647156,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999476671218872,
+      "sampling/importance_sampling_ratio/min": 0.0006672164890915155,
+      "sampling/sampling_logp_difference/max": 7.312396049499512,
+      "sampling/sampling_logp_difference/mean": 0.01975739374756813,
+      "step": 459
+    },
+    {
+      "clip_ratio/high_max": 1.215636098095274e-05,
+      "clip_ratio/high_mean": 3.039090245238185e-06,
+      "clip_ratio/low_mean": 4.157363855483709e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4612729197979206e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15727.0,
+      "completions/mean_length": 7282.875,
+      "completions/mean_terminated_length": 6912.91015625,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9037974923849106,
+      "epoch": 0.42318307267709293,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021072588860988617,
+      "learning_rate": 1e-05,
+      "loss": 0.0866,
+      "num_tokens": 385516659.0,
+      "reward": 0.359375,
+      "reward_std": 0.3277292251586914,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0013449778780341148,
+      "sampling/sampling_logp_difference/max": 6.611377716064453,
+      "sampling/sampling_logp_difference/mean": 0.018494941294193268,
+      "step": 460
+    },
+    {
+      "clip_ratio/high_max": 1.669851098995423e-05,
+      "clip_ratio/high_mean": 4.174627747488557e-06,
+      "clip_ratio/low_mean": 2.594786496956658e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0122492944428814e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14783.0,
+      "completions/mean_length": 7063.6953125,
+      "completions/mean_terminated_length": 6840.00830078125,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9738125056028366,
+      "epoch": 0.42410303587856485,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020963819697499275,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 386440556.0,
+      "reward": 0.4765625,
+      "reward_std": 0.28930407762527466,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999623894691467,
+      "sampling/importance_sampling_ratio/min": 7.853446390981844e-07,
+      "sampling/sampling_logp_difference/max": 14.057143211364746,
+      "sampling/sampling_logp_difference/mean": 0.0198366716504097,
+      "step": 461
+    },
+    {
+      "clip_ratio/high_max": 3.949322490370832e-06,
+      "clip_ratio/high_mean": 9.87330622592708e-07,
+      "clip_ratio/low_mean": 1.8185473095400084e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9172803717992792e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15651.0,
+      "completions/mean_length": 7672.7421875,
+      "completions/mean_terminated_length": 7262.0244140625,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 1.0194172486662865,
+      "epoch": 0.4250229990800368,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014175203396007419,
+      "learning_rate": 1e-05,
+      "loss": 0.0232,
+      "num_tokens": 387450843.0,
+      "reward": 0.4609375,
+      "reward_std": 0.24541424214839935,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999865889549255,
+      "sampling/importance_sampling_ratio/min": 0.004501644056290388,
+      "sampling/sampling_logp_difference/max": 5.403312683105469,
+      "sampling/sampling_logp_difference/mean": 0.02058412693440914,
+      "step": 462
+    },
+    {
+      "clip_ratio/high_max": 2.1894326664551045e-05,
+      "clip_ratio/high_mean": 6.6363724613438535e-06,
+      "clip_ratio/low_mean": 8.431412652498693e-05,
+      "clip_ratio/low_min": 3.288245125077083e-05,
+      "clip_ratio/region_mean": 9.095049927054788e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 6846.8828125,
+      "completions/mean_terminated_length": 6459.19482421875,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.886472262442112,
+      "epoch": 0.42594296228150874,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002659202553331852,
+      "learning_rate": 1e-05,
+      "loss": 0.1199,
+      "num_tokens": 388344660.0,
+      "reward": 0.34375,
+      "reward_std": 0.40267258882522583,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000640153884888,
+      "sampling/importance_sampling_ratio/min": 0.00015848172188270837,
+      "sampling/sampling_logp_difference/max": 8.749871253967285,
+      "sampling/sampling_logp_difference/mean": 0.018909990787506104,
+      "step": 463
+    },
+    {
+      "clip_ratio/high_max": 1.3184767340135295e-05,
+      "clip_ratio/high_mean": 3.2961918350338237e-06,
+      "clip_ratio/low_mean": 4.2340758909631404e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.563695051729155e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16261.0,
+      "completions/mean_length": 6271.84375,
+      "completions/mean_terminated_length": 6029.15234375,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.9538674280047417,
+      "epoch": 0.42686292548298066,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002775643253698945,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 389167344.0,
+      "reward": 0.484375,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000534057617188,
+      "sampling/importance_sampling_ratio/min": 0.0022844907362014055,
+      "sampling/sampling_logp_difference/max": 6.0816121101379395,
+      "sampling/sampling_logp_difference/mean": 0.020731300115585327,
+      "step": 464
+    },
+    {
+      "clip_ratio/high_max": 5.017863713874249e-06,
+      "clip_ratio/high_mean": 1.2544659284685622e-06,
+      "clip_ratio/low_mean": 3.720694280673342e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.846140884888882e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6312.9765625,
+      "completions/mean_terminated_length": 6233.67724609375,
+      "completions/min_length": 833.0,
+      "completions/min_terminated_length": 833.0,
+      "entropy": 0.937890075147152,
+      "epoch": 0.42778288868445263,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.001834206865169108,
+      "learning_rate": 1e-05,
+      "loss": 0.0434,
+      "num_tokens": 389993613.0,
+      "reward": 0.484375,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000054836273193,
+      "sampling/importance_sampling_ratio/min": 0.0004770693776663393,
+      "sampling/sampling_logp_difference/max": 7.647848606109619,
+      "sampling/sampling_logp_difference/mean": 0.020461473613977432,
+      "step": 465
+    },
+    {
+      "clip_ratio/high_max": 1.484874360357935e-05,
+      "clip_ratio/high_mean": 3.7121859008948377e-06,
+      "clip_ratio/low_mean": 3.374425170932227e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7456437212313176e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15638.0,
+      "completions/mean_length": 5643.125,
+      "completions/mean_terminated_length": 5385.34423828125,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "entropy": 0.9210820645093918,
+      "epoch": 0.42870285188592455,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015243689995259047,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 390735629.0,
+      "reward": 0.4765625,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998995661735535,
+      "sampling/importance_sampling_ratio/min": 1.4338597509322426e-07,
+      "sampling/sampling_logp_difference/max": 15.757725715637207,
+      "sampling/sampling_logp_difference/mean": 0.01841399073600769,
+      "step": 466
+    },
+    {
+      "clip_ratio/high_max": 5.748976491304347e-06,
+      "clip_ratio/high_mean": 1.4372441228260868e-06,
+      "clip_ratio/low_mean": 3.702218441503646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.845942796942836e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16084.0,
+      "completions/mean_length": 8048.40625,
+      "completions/mean_terminated_length": 7848.3525390625,
+      "completions/min_length": 1236.0,
+      "completions/min_terminated_length": 1236.0,
+      "entropy": 1.048905499279499,
+      "epoch": 0.4296228150873965,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026433062739670277,
+      "learning_rate": 1e-05,
+      "loss": 0.0548,
+      "num_tokens": 391786761.0,
+      "reward": 0.265625,
+      "reward_std": 0.22962789237499237,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000020980834961,
+      "sampling/importance_sampling_ratio/min": 0.0006000763387419283,
+      "sampling/sampling_logp_difference/max": 7.418453693389893,
+      "sampling/sampling_logp_difference/mean": 0.021647389978170395,
+      "step": 467
+    },
+    {
+      "clip_ratio/high_max": 2.0228523908372154e-05,
+      "clip_ratio/high_mean": 5.057130977093038e-06,
+      "clip_ratio/low_mean": 5.334191632755392e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.839904770255089e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16197.0,
+      "completions/mean_length": 7073.078125,
+      "completions/mean_terminated_length": 6772.7255859375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 1.0020805671811104,
+      "epoch": 0.43054277828886844,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019431376131251454,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 392709699.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2914257347583771,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999632239341736,
+      "sampling/importance_sampling_ratio/min": 0.0003546403022482991,
+      "sampling/sampling_logp_difference/max": 7.944406509399414,
+      "sampling/sampling_logp_difference/mean": 0.020886382088065147,
+      "step": 468
+    },
+    {
+      "clip_ratio/high_max": 8.001388550837873e-06,
+      "clip_ratio/high_mean": 2.0003471377094684e-06,
+      "clip_ratio/low_mean": 5.976677766739158e-05,
+      "clip_ratio/low_min": 1.2241466720297467e-05,
+      "clip_ratio/region_mean": 6.176712395244977e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16249.0,
+      "completions/mean_length": 7128.5390625,
+      "completions/mean_terminated_length": 6981.62744140625,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "entropy": 0.9986839666962624,
+      "epoch": 0.43146274149034036,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002014609519392252,
+      "learning_rate": 1e-05,
+      "loss": 0.0787,
+      "num_tokens": 393643864.0,
+      "reward": 0.265625,
+      "reward_std": 0.3411741852760315,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000815391540527,
+      "sampling/importance_sampling_ratio/min": 0.0030073157977312803,
+      "sampling/sampling_logp_difference/max": 5.806707382202148,
+      "sampling/sampling_logp_difference/mean": 0.020323367789387703,
+      "step": 469
+    },
+    {
+      "clip_ratio/high_max": 1.0874447525566211e-05,
+      "clip_ratio/high_mean": 2.7186118813915527e-06,
+      "clip_ratio/low_mean": 3.265329507939896e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.537190696079051e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14445.0,
+      "completions/mean_length": 5538.625,
+      "completions/mean_terminated_length": 5366.4765625,
+      "completions/min_length": 1149.0,
+      "completions/min_terminated_length": 1149.0,
+      "entropy": 1.0297009721398354,
+      "epoch": 0.43238270469181234,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019020825857296586,
+      "learning_rate": 1e-05,
+      "loss": 0.0277,
+      "num_tokens": 394371184.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20699402689933777,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999639987945557,
+      "sampling/importance_sampling_ratio/min": 0.00010906249372055754,
+      "sampling/sampling_logp_difference/max": 9.123589515686035,
+      "sampling/sampling_logp_difference/mean": 0.01992623880505562,
+      "step": 470
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.427005844969244e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.427005844969244e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16072.0,
+      "completions/mean_length": 7305.7109375,
+      "completions/mean_terminated_length": 7087.83251953125,
+      "completions/min_length": 1106.0,
+      "completions/min_terminated_length": 1106.0,
+      "entropy": 0.9444865211844444,
+      "epoch": 0.43330266789328425,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037416366394609213,
+      "learning_rate": 1e-05,
+      "loss": 0.07,
+      "num_tokens": 395325427.0,
+      "reward": 0.375,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999666213989258,
+      "sampling/importance_sampling_ratio/min": 1.3054028613623814e-06,
+      "sampling/sampling_logp_difference/max": 13.548998832702637,
+      "sampling/sampling_logp_difference/mean": 0.02093587815761566,
+      "step": 471
+    },
+    {
+      "clip_ratio/high_max": 1.0206378192378907e-05,
+      "clip_ratio/high_mean": 2.5515945480947266e-06,
+      "clip_ratio/low_mean": 2.926629849753226e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.181789293194015e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16266.0,
+      "completions/mean_length": 6020.71875,
+      "completions/mean_terminated_length": 5686.4189453125,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "entropy": 0.9555193856358528,
+      "epoch": 0.43422263109475623,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003123396774753928,
+      "learning_rate": 1e-05,
+      "loss": 0.0906,
+      "num_tokens": 396118047.0,
+      "reward": 0.375,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 9.029568900587037e-05,
+      "sampling/sampling_logp_difference/max": 9.312420845031738,
+      "sampling/sampling_logp_difference/mean": 0.019349105656147003,
+      "step": 472
+    },
+    {
+      "clip_ratio/high_max": 7.391638519038679e-06,
+      "clip_ratio/high_mean": 1.8479096297596698e-06,
+      "clip_ratio/low_mean": 4.082024281615304e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.266815255959955e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16177.0,
+      "completions/mean_length": 6789.203125,
+      "completions/mean_terminated_length": 6149.55029296875,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.8103364855051041,
+      "epoch": 0.43514259429622815,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017067189328372478,
+      "learning_rate": 1e-05,
+      "loss": 0.0618,
+      "num_tokens": 397008497.0,
+      "reward": 0.421875,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000635385513306,
+      "sampling/importance_sampling_ratio/min": 1.8778002868202748e-06,
+      "sampling/sampling_logp_difference/max": 13.185409545898438,
+      "sampling/sampling_logp_difference/mean": 0.01813405565917492,
+      "step": 473
+    },
+    {
+      "clip_ratio/high_max": 3.4544700611149892e-06,
+      "clip_ratio/high_mean": 1.6775043150119018e-06,
+      "clip_ratio/low_mean": 3.894365818268852e-05,
+      "clip_ratio/low_min": 3.4544700611149892e-06,
+      "clip_ratio/region_mean": 4.0621162042953074e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16205.0,
+      "completions/mean_length": 8000.53125,
+      "completions/mean_terminated_length": 7934.51953125,
+      "completions/min_length": 911.0,
+      "completions/min_terminated_length": 911.0,
+      "entropy": 1.0201406553387642,
+      "epoch": 0.43606255749770007,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001533582923002541,
+      "learning_rate": 1e-05,
+      "loss": 0.0826,
+      "num_tokens": 398052373.0,
+      "reward": 0.328125,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000503063201904,
+      "sampling/importance_sampling_ratio/min": 3.783419288083678e-06,
+      "sampling/sampling_logp_difference/max": 12.484882354736328,
+      "sampling/sampling_logp_difference/mean": 0.02113974839448929,
+      "step": 474
+    },
+    {
+      "clip_ratio/high_max": 5.666878223564709e-06,
+      "clip_ratio/high_mean": 1.4167195558911772e-06,
+      "clip_ratio/low_mean": 1.8879915842262562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0296635739214253e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15121.0,
+      "completions/max_terminated_length": 15121.0,
+      "completions/mean_length": 6122.6875,
+      "completions/mean_terminated_length": 6122.6875,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 512.0,
+      "entropy": 1.0430640205740929,
+      "epoch": 0.43698252069917204,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0025845973286777735,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 398855205.0,
+      "reward": 0.5,
+      "reward_std": 0.24777325987815857,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999109506607056,
+      "sampling/importance_sampling_ratio/min": 3.3893353247549385e-05,
+      "sampling/sampling_logp_difference/max": 10.292291641235352,
+      "sampling/sampling_logp_difference/mean": 0.020821597427129745,
+      "step": 475
+    },
+    {
+      "clip_ratio/high_max": 6.862502914373181e-06,
+      "clip_ratio/high_mean": 1.7156257285932952e-06,
+      "clip_ratio/low_mean": 3.732125173883105e-05,
+      "clip_ratio/low_min": 3.870448381348979e-06,
+      "clip_ratio/region_mean": 3.9036877069520415e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16308.0,
+      "completions/mean_length": 6895.4453125,
+      "completions/mean_terminated_length": 6820.732421875,
+      "completions/min_length": 758.0,
+      "completions/min_terminated_length": 758.0,
+      "entropy": 1.097649298608303,
+      "epoch": 0.43790248390064396,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00137829699087888,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "num_tokens": 399758166.0,
+      "reward": 0.2890625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999674558639526,
+      "sampling/importance_sampling_ratio/min": 8.400417755183298e-06,
+      "sampling/sampling_logp_difference/max": 11.68722915649414,
+      "sampling/sampling_logp_difference/mean": 0.02135382406413555,
+      "step": 476
+    },
+    {
+      "clip_ratio/high_max": 8.859707577357767e-06,
+      "clip_ratio/high_mean": 2.2149268943394418e-06,
+      "clip_ratio/low_mean": 3.0371424600161845e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.258635138081445e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14854.0,
+      "completions/mean_length": 5552.265625,
+      "completions/mean_terminated_length": 5380.33349609375,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 0.9384580478072166,
+      "epoch": 0.43882244710211593,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002469305880367756,
+      "learning_rate": 1e-05,
+      "loss": 0.0868,
+      "num_tokens": 400488560.0,
+      "reward": 0.515625,
+      "reward_std": 0.29826050996780396,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998993277549744,
+      "sampling/importance_sampling_ratio/min": 1.934680221893359e-05,
+      "sampling/sampling_logp_difference/max": 10.852983474731445,
+      "sampling/sampling_logp_difference/mean": 0.019046220928430557,
+      "step": 477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.751295116671827e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.751295116671827e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6492.8125,
+      "completions/mean_terminated_length": 6335.81005859375,
+      "completions/min_length": 1238.0,
+      "completions/min_terminated_length": 1238.0,
+      "entropy": 0.9447641968727112,
+      "epoch": 0.43974241030358785,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019261077977716923,
+      "learning_rate": 1e-05,
+      "loss": 0.0684,
+      "num_tokens": 401339544.0,
+      "reward": 0.359375,
+      "reward_std": 0.27221953868865967,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999949932098389,
+      "sampling/importance_sampling_ratio/min": 0.016565052792429924,
+      "sampling/sampling_logp_difference/max": 4.100460052490234,
+      "sampling/sampling_logp_difference/mean": 0.018938450142741203,
+      "step": 478
+    },
+    {
+      "clip_ratio/high_max": 1.0270573739035171e-05,
+      "clip_ratio/high_mean": 2.567643434758793e-06,
+      "clip_ratio/low_mean": 3.2130441354638606e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4698084505180304e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15458.0,
+      "completions/mean_length": 6688.5546875,
+      "completions/mean_terminated_length": 6211.72900390625,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.9593756124377251,
+      "epoch": 0.4406623735050598,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027252996806055307,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 402213983.0,
+      "reward": 0.4375,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999223947525024,
+      "sampling/importance_sampling_ratio/min": 0.09333998709917068,
+      "sampling/sampling_logp_difference/max": 2.371506690979004,
+      "sampling/sampling_logp_difference/mean": 0.020656028762459755,
+      "step": 479
+    },
+    {
+      "clip_ratio/high_max": 4.220976734359283e-06,
+      "clip_ratio/high_mean": 1.0552441835898208e-06,
+      "clip_ratio/low_mean": 2.7019574872610974e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.807481928357447e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15739.0,
+      "completions/mean_length": 6957.8828125,
+      "completions/mean_terminated_length": 6808.26220703125,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "entropy": 0.9458145052194595,
+      "epoch": 0.44158233670653174,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021163993515074253,
+      "learning_rate": 1e-05,
+      "loss": -0.0054,
+      "num_tokens": 403124296.0,
+      "reward": 0.3125,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000032186508179,
+      "sampling/importance_sampling_ratio/min": 5.414607926468307e-07,
+      "sampling/sampling_logp_difference/max": 14.428995132446289,
+      "sampling/sampling_logp_difference/mean": 0.019670519977808,
+      "step": 480
+    },
+    {
+      "clip_ratio/high_max": 1.4141203109829803e-05,
+      "clip_ratio/high_mean": 4.24627120310106e-06,
+      "clip_ratio/low_mean": 3.319961399483873e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7445884800035856e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16107.0,
+      "completions/mean_length": 7141.8359375,
+      "completions/mean_terminated_length": 6843.701171875,
+      "completions/min_length": 1005.0,
+      "completions/min_terminated_length": 1005.0,
+      "entropy": 0.9727424532175064,
+      "epoch": 0.44250229990800366,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0024569793604314327,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 404056571.0,
+      "reward": 0.421875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999956488609314,
+      "sampling/importance_sampling_ratio/min": 8.950789379014168e-06,
+      "sampling/sampling_logp_difference/max": 11.62376880645752,
+      "sampling/sampling_logp_difference/mean": 0.020752113312482834,
+      "step": 481
+    },
+    {
+      "clip_ratio/high_max": 1.5587193956889678e-05,
+      "clip_ratio/high_mean": 4.596514145305264e-06,
+      "clip_ratio/low_mean": 6.96504166626255e-05,
+      "clip_ratio/low_min": 7.279775445567793e-06,
+      "clip_ratio/region_mean": 7.424693194479914e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 7685.046875,
+      "completions/mean_terminated_length": 7476.2724609375,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9029846489429474,
+      "epoch": 0.44342226310947563,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0019990119617432356,
+      "learning_rate": 1e-05,
+      "loss": 0.1109,
+      "num_tokens": 405058705.0,
+      "reward": 0.421875,
+      "reward_std": 0.38375797867774963,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999930262565613,
+      "sampling/importance_sampling_ratio/min": 0.002107172505930066,
+      "sampling/sampling_logp_difference/max": 6.162408351898193,
+      "sampling/sampling_logp_difference/mean": 0.01937328279018402,
+      "step": 482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7506703443359584e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7506703443359584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 7004.21875,
+      "completions/mean_terminated_length": 6779.1044921875,
+      "completions/min_length": 936.0,
+      "completions/min_terminated_length": 936.0,
+      "entropy": 0.9121566936373711,
+      "epoch": 0.44434222631094755,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029584914445877075,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 405974789.0,
+      "reward": 0.5234375,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000674724578857,
+      "sampling/importance_sampling_ratio/min": 0.000792751437984407,
+      "sampling/sampling_logp_difference/max": 7.140000820159912,
+      "sampling/sampling_logp_difference/mean": 0.019368886947631836,
+      "step": 483
+    },
+    {
+      "clip_ratio/high_max": 1.2470530009522918e-05,
+      "clip_ratio/high_mean": 3.1176325023807294e-06,
+      "clip_ratio/low_mean": 3.606646794196422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.918410050118837e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 6294.90625,
+      "completions/mean_terminated_length": 6215.46435546875,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.899978794157505,
+      "epoch": 0.4452621895124195,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001394490827806294,
+      "learning_rate": 1e-05,
+      "loss": 0.0376,
+      "num_tokens": 406798417.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2577856183052063,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000015497207642,
+      "sampling/importance_sampling_ratio/min": 0.0007101757801137865,
+      "sampling/sampling_logp_difference/max": 7.249998092651367,
+      "sampling/sampling_logp_difference/mean": 0.018764980137348175,
+      "step": 484
+    },
+    {
+      "clip_ratio/high_max": 1.568959305586759e-05,
+      "clip_ratio/high_mean": 3.9223982639668975e-06,
+      "clip_ratio/low_mean": 3.593084011299652e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.985323814958974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15685.0,
+      "completions/mean_length": 6940.046875,
+      "completions/mean_terminated_length": 6790.14306640625,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9006319642066956,
+      "epoch": 0.44618215271389144,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002361331367865205,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 407703351.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35611939430236816,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999904036521912,
+      "sampling/importance_sampling_ratio/min": 4.8537625843891874e-05,
+      "sampling/sampling_logp_difference/max": 9.933171272277832,
+      "sampling/sampling_logp_difference/mean": 0.019578561186790466,
+      "step": 485
+    },
+    {
+      "clip_ratio/high_max": 5.896504717384232e-06,
+      "clip_ratio/high_mean": 1.474126179346058e-06,
+      "clip_ratio/low_mean": 4.614499187027832e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7619118163311214e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 6362.484375,
+      "completions/mean_terminated_length": 6283.57470703125,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 0.9299133494496346,
+      "epoch": 0.44710211591536336,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027458088006824255,
+      "learning_rate": 1e-05,
+      "loss": 0.0537,
+      "num_tokens": 408537765.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3595392107963562,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999920129776001,
+      "sampling/importance_sampling_ratio/min": 0.0007113060564734042,
+      "sampling/sampling_logp_difference/max": 7.24840784072876,
+      "sampling/sampling_logp_difference/mean": 0.019821636378765106,
+      "step": 486
+    },
+    {
+      "clip_ratio/high_max": 2.0891785879939562e-05,
+      "clip_ratio/high_mean": 7.879635973040422e-06,
+      "clip_ratio/low_mean": 2.6475246386326035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.435488224567962e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15924.0,
+      "completions/max_terminated_length": 15924.0,
+      "completions/mean_length": 5226.765625,
+      "completions/mean_terminated_length": 5226.765625,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "entropy": 1.0277203470468521,
+      "epoch": 0.44802207911683534,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024366467259824276,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 409223903.0,
+      "reward": 0.546875,
+      "reward_std": 0.3006146252155304,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000044584274292,
+      "sampling/importance_sampling_ratio/min": 0.01590813137590885,
+      "sampling/sampling_logp_difference/max": 4.14092493057251,
+      "sampling/sampling_logp_difference/mean": 0.019991066306829453,
+      "step": 487
+    },
+    {
+      "clip_ratio/high_max": 9.688145382824587e-06,
+      "clip_ratio/high_mean": 2.4220363457061467e-06,
+      "clip_ratio/low_mean": 1.920005956890236e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.162209625566902e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12344.0,
+      "completions/max_terminated_length": 12344.0,
+      "completions/mean_length": 5051.0,
+      "completions/mean_terminated_length": 5051.0,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.8572651967406273,
+      "epoch": 0.44894204231830726,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0027017516549676657,
+      "learning_rate": 1e-05,
+      "loss": -0.003,
+      "num_tokens": 409895199.0,
+      "reward": 0.6015625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940037727356,
+      "sampling/importance_sampling_ratio/min": 5.7065666624112055e-05,
+      "sampling/sampling_logp_difference/max": 9.771307945251465,
+      "sampling/sampling_logp_difference/mean": 0.01831716299057007,
+      "step": 488
+    },
+    {
+      "clip_ratio/high_max": 1.5306721707020188e-05,
+      "clip_ratio/high_mean": 3.826680426755047e-06,
+      "clip_ratio/low_mean": 3.0764163398089295e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4590844165904855e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13702.0,
+      "completions/mean_length": 6231.9765625,
+      "completions/mean_terminated_length": 6070.83349609375,
+      "completions/min_length": 488.0,
+      "completions/min_terminated_length": 488.0,
+      "entropy": 0.9115571528673172,
+      "epoch": 0.44986200551977923,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021461176220327616,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 410711300.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2672119140625,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000066757202148,
+      "sampling/importance_sampling_ratio/min": 0.00019801831513177603,
+      "sampling/sampling_logp_difference/max": 8.527151107788086,
+      "sampling/sampling_logp_difference/mean": 0.019596103578805923,
+      "step": 489
+    },
+    {
+      "clip_ratio/high_max": 2.7797910661320202e-05,
+      "clip_ratio/high_mean": 9.322406867795507e-06,
+      "clip_ratio/low_mean": 6.275825364809862e-05,
+      "clip_ratio/low_min": 3.0194694318197435e-06,
+      "clip_ratio/region_mean": 7.208066119801515e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16059.0,
+      "completions/mean_length": 6766.4765625,
+      "completions/mean_terminated_length": 6375.52001953125,
+      "completions/min_length": 764.0,
+      "completions/min_terminated_length": 764.0,
+      "entropy": 0.8712737187743187,
+      "epoch": 0.45078196872125115,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0019740054849535227,
+      "learning_rate": 1e-05,
+      "loss": 0.091,
+      "num_tokens": 411597969.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3521803915500641,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997878074646,
+      "sampling/importance_sampling_ratio/min": 7.488903065677732e-05,
+      "sampling/sampling_logp_difference/max": 9.499503135681152,
+      "sampling/sampling_logp_difference/mean": 0.018991166725754738,
+      "step": 490
+    },
+    {
+      "clip_ratio/high_max": 4.992810318071861e-06,
+      "clip_ratio/high_mean": 1.2482025795179652e-06,
+      "clip_ratio/low_mean": 1.100720277236178e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.2255405295036326e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14776.0,
+      "completions/max_terminated_length": 14776.0,
+      "completions/mean_length": 6619.1171875,
+      "completions/mean_terminated_length": 6619.1171875,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 1.1462209969758987,
+      "epoch": 0.45170193192272307,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.001665184274315834,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 412464384.0,
+      "reward": 0.3046875,
+      "reward_std": 0.17806214094161987,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999956488609314,
+      "sampling/importance_sampling_ratio/min": 0.009808298200368881,
+      "sampling/sampling_logp_difference/max": 4.624526500701904,
+      "sampling/sampling_logp_difference/mean": 0.02124062180519104,
+      "step": 491
+    },
+    {
+      "clip_ratio/high_max": 1.5520400665991474e-05,
+      "clip_ratio/high_mean": 3.8801001664978685e-06,
+      "clip_ratio/low_mean": 2.0763711063409573e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.464381134359428e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 7035.25,
+      "completions/mean_terminated_length": 6886.857421875,
+      "completions/min_length": 821.0,
+      "completions/min_terminated_length": 821.0,
+      "entropy": 0.9810440614819527,
+      "epoch": 0.45262189512419504,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0015779118984937668,
+      "learning_rate": 1e-05,
+      "loss": 0.0582,
+      "num_tokens": 413383792.0,
+      "reward": 0.4453125,
+      "reward_std": 0.21436068415641785,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999344944953918,
+      "sampling/importance_sampling_ratio/min": 0.01566622592508793,
+      "sampling/sampling_logp_difference/max": 4.156248092651367,
+      "sampling/sampling_logp_difference/mean": 0.021432677283883095,
+      "step": 492
+    },
+    {
+      "clip_ratio/high_max": 4.644250566343544e-06,
+      "clip_ratio/high_mean": 1.161062641585886e-06,
+      "clip_ratio/low_mean": 3.4143843777201255e-05,
+      "clip_ratio/low_min": 3.276024699516711e-06,
+      "clip_ratio/region_mean": 3.530490653247398e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15869.0,
+      "completions/mean_length": 6945.9375,
+      "completions/mean_terminated_length": 6796.12744140625,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.7932121306657791,
+      "epoch": 0.45354185832566696,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013254050863906741,
+      "learning_rate": 1e-05,
+      "loss": 0.0357,
+      "num_tokens": 414290000.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999009370803833,
+      "sampling/importance_sampling_ratio/min": 7.031726272543892e-05,
+      "sampling/sampling_logp_difference/max": 9.562493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018196485936641693,
+      "step": 493
+    },
+    {
+      "clip_ratio/high_max": 1.8977402305608848e-05,
+      "clip_ratio/high_mean": 4.744350576402212e-06,
+      "clip_ratio/low_mean": 3.744401988114987e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218837011649157e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14930.0,
+      "completions/mean_length": 7526.4375,
+      "completions/mean_terminated_length": 7313.8564453125,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "entropy": 0.9790460616350174,
+      "epoch": 0.45446182152713893,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001866620616056025,
+      "learning_rate": 1e-05,
+      "loss": 0.0707,
+      "num_tokens": 415272280.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2517249584197998,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998944997787476,
+      "sampling/importance_sampling_ratio/min": 0.00020347593817859888,
+      "sampling/sampling_logp_difference/max": 8.49996280670166,
+      "sampling/sampling_logp_difference/mean": 0.020433884114027023,
+      "step": 494
+    },
+    {
+      "clip_ratio/high_max": 7.432954589603469e-06,
+      "clip_ratio/high_mean": 3.44574186783575e-06,
+      "clip_ratio/low_mean": 4.426451175731927e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7710253397781344e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15964.0,
+      "completions/mean_length": 6108.8671875,
+      "completions/mean_terminated_length": 5862.26416015625,
+      "completions/min_length": 527.0,
+      "completions/min_terminated_length": 527.0,
+      "entropy": 0.8818904608488083,
+      "epoch": 0.45538178472861085,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002431972650811076,
+      "learning_rate": 1e-05,
+      "loss": 0.0175,
+      "num_tokens": 416072591.0,
+      "reward": 0.59375,
+      "reward_std": 0.26720699667930603,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999450445175171,
+      "sampling/importance_sampling_ratio/min": 0.001706472015939653,
+      "sampling/sampling_logp_difference/max": 6.373327255249023,
+      "sampling/sampling_logp_difference/mean": 0.01932165026664734,
+      "step": 495
+    },
+    {
+      "clip_ratio/high_max": 9.704292551759863e-06,
+      "clip_ratio/high_mean": 2.426073137939966e-06,
+      "clip_ratio/low_mean": 1.47394894156605e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7165562553600466e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15239.0,
+      "completions/max_terminated_length": 15239.0,
+      "completions/mean_length": 6841.59375,
+      "completions/mean_terminated_length": 6841.59375,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 1.1732418313622475,
+      "epoch": 0.4563017479300828,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002203838201239705,
+      "learning_rate": 1e-05,
+      "loss": 0.0308,
+      "num_tokens": 416966187.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2637920379638672,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998947381973267,
+      "sampling/importance_sampling_ratio/min": 0.0004944052780047059,
+      "sampling/sampling_logp_difference/max": 7.612154960632324,
+      "sampling/sampling_logp_difference/mean": 0.02160799130797386,
+      "step": 496
+    },
+    {
+      "clip_ratio/high_max": 2.328647701688169e-05,
+      "clip_ratio/high_mean": 5.821619254220423e-06,
+      "clip_ratio/low_mean": 5.462882245410583e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.0450441651482834e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13836.0,
+      "completions/max_terminated_length": 13836.0,
+      "completions/mean_length": 5898.7421875,
+      "completions/mean_terminated_length": 5898.7421875,
+      "completions/min_length": 675.0,
+      "completions/min_terminated_length": 675.0,
+      "entropy": 0.9141146093606949,
+      "epoch": 0.45722171113155474,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028326623141765594,
+      "learning_rate": 1e-05,
+      "loss": 0.0662,
+      "num_tokens": 417740586.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32984596490859985,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998853206634521,
+      "sampling/importance_sampling_ratio/min": 1.0281119102728553e-06,
+      "sampling/sampling_logp_difference/max": 13.787786483764648,
+      "sampling/sampling_logp_difference/mean": 0.01856965571641922,
+      "step": 497
+    },
+    {
+      "clip_ratio/high_max": 2.667783610377228e-05,
+      "clip_ratio/high_mean": 6.66945902594307e-06,
+      "clip_ratio/low_mean": 4.455613873233233e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.122559878145694e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16314.0,
+      "completions/mean_length": 6416.140625,
+      "completions/mean_terminated_length": 6176.912109375,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.8854602724313736,
+      "epoch": 0.45814167433302666,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001950124162249267,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 418579788.0,
+      "reward": 0.5078125,
+      "reward_std": 0.25012245774269104,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998780488967896,
+      "sampling/importance_sampling_ratio/min": 2.6581541533232667e-05,
+      "sampling/sampling_logp_difference/max": 10.535293579101562,
+      "sampling/sampling_logp_difference/mean": 0.01931869424879551,
+      "step": 498
+    },
+    {
+      "clip_ratio/high_max": 3.6452713629842037e-06,
+      "clip_ratio/high_mean": 9.113178407460509e-07,
+      "clip_ratio/low_mean": 3.819847256636422e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910979035026685e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15548.0,
+      "completions/mean_length": 7738.2578125,
+      "completions/mean_terminated_length": 7313.05712890625,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.9239770472049713,
+      "epoch": 0.45906163753449863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0016899642068892717,
+      "learning_rate": 1e-05,
+      "loss": 0.0844,
+      "num_tokens": 419589021.0,
+      "reward": 0.375,
+      "reward_std": 0.20069600641727448,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000193119049072,
+      "sampling/importance_sampling_ratio/min": 0.00016869053069967777,
+      "sampling/sampling_logp_difference/max": 8.687444686889648,
+      "sampling/sampling_logp_difference/mean": 0.01966589316725731,
+      "step": 499
+    },
+    {
+      "clip_ratio/high_max": 1.0700351140258135e-05,
+      "clip_ratio/high_mean": 2.675087785064534e-06,
+      "clip_ratio/low_mean": 3.456382330568886e-05,
+      "clip_ratio/low_min": 4.663483196054585e-06,
+      "clip_ratio/region_mean": 3.723891120444023e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16240.0,
+      "completions/mean_length": 7594.921875,
+      "completions/mean_terminated_length": 7383.984375,
+      "completions/min_length": 1049.0,
+      "completions/min_terminated_length": 1049.0,
+      "entropy": 0.9970445707440376,
+      "epoch": 0.45998160073597055,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0026633136440068483,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 420579459.0,
+      "reward": 0.40625,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000439882278442,
+      "sampling/importance_sampling_ratio/min": 0.000756366120185703,
+      "sampling/sampling_logp_difference/max": 7.186985015869141,
+      "sampling/sampling_logp_difference/mean": 0.020969431847333908,
+      "step": 500
+    },
+    {
+      "clip_ratio/high_max": 2.166650710933027e-05,
+      "clip_ratio/high_mean": 6.6261792426303145e-06,
+      "clip_ratio/low_mean": 5.730952580051962e-05,
+      "clip_ratio/low_min": 4.826068561669672e-06,
+      "clip_ratio/region_mean": 6.393570629370515e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14856.0,
+      "completions/max_terminated_length": 14856.0,
+      "completions/mean_length": 5897.2890625,
+      "completions/mean_terminated_length": 5897.2890625,
+      "completions/min_length": 57.0,
+      "completions/min_terminated_length": 57.0,
+      "entropy": 0.9427390918135643,
+      "epoch": 0.4609015639374425,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015446916222572327,
+      "learning_rate": 1e-05,
+      "loss": -0.0487,
+      "num_tokens": 421354536.0,
+      "reward": 0.40625,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000121593475342,
+      "sampling/importance_sampling_ratio/min": 0.00037080893525853753,
+      "sampling/sampling_logp_difference/max": 7.8998236656188965,
+      "sampling/sampling_logp_difference/mean": 0.019464563578367233,
+      "step": 501
+    },
+    {
+      "clip_ratio/high_max": 3.1168960958893877e-06,
+      "clip_ratio/high_mean": 7.792240239723469e-07,
+      "clip_ratio/low_mean": 1.842527422013518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9204498244107526e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16365.0,
+      "completions/mean_length": 7197.1875,
+      "completions/mean_terminated_length": 6900.83837890625,
+      "completions/min_length": 1181.0,
+      "completions/min_terminated_length": 1181.0,
+      "entropy": 0.9357216581702232,
+      "epoch": 0.46182152713891444,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019460292533040047,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 422296632.0,
+      "reward": 0.4921875,
+      "reward_std": 0.20934812724590302,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999638795852661,
+      "sampling/importance_sampling_ratio/min": 0.0004937088815495372,
+      "sampling/sampling_logp_difference/max": 7.613564491271973,
+      "sampling/sampling_logp_difference/mean": 0.0199101734906435,
+      "step": 502
+    },
+    {
+      "clip_ratio/high_max": 3.01917771139415e-06,
+      "clip_ratio/high_mean": 7.547944278485375e-07,
+      "clip_ratio/low_mean": 2.4536840555811068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5291634983659605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16092.0,
+      "completions/mean_length": 6675.8515625,
+      "completions/mean_terminated_length": 6599.40966796875,
+      "completions/min_length": 1369.0,
+      "completions/min_terminated_length": 1369.0,
+      "entropy": 0.8980752006173134,
+      "epoch": 0.46274149034038636,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017490689642727375,
+      "learning_rate": 1e-05,
+      "loss": 0.0634,
+      "num_tokens": 423170085.0,
+      "reward": 0.484375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999966025352478,
+      "sampling/importance_sampling_ratio/min": 4.0153237932827324e-05,
+      "sampling/sampling_logp_difference/max": 10.122807502746582,
+      "sampling/sampling_logp_difference/mean": 0.01868046447634697,
+      "step": 503
+    },
+    {
+      "clip_ratio/high_max": 1.4156895304040518e-05,
+      "clip_ratio/high_mean": 4.290660626793397e-06,
+      "clip_ratio/low_mean": 4.468955739866942e-05,
+      "clip_ratio/low_min": 3.951194685214432e-06,
+      "clip_ratio/region_mean": 4.898021779808914e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 7394.5546875,
+      "completions/mean_terminated_length": 6874.50390625,
+      "completions/min_length": 909.0,
+      "completions/min_terminated_length": 909.0,
+      "entropy": 0.891602098941803,
+      "epoch": 0.46366145354185834,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026261890307068825,
+      "learning_rate": 1e-05,
+      "loss": 0.0981,
+      "num_tokens": 424134916.0,
+      "reward": 0.484375,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0019415394635871053,
+      "sampling/sampling_logp_difference/max": 6.244274139404297,
+      "sampling/sampling_logp_difference/mean": 0.018863018602132797,
+      "step": 504
+    },
+    {
+      "clip_ratio/high_max": 4.867222287430195e-06,
+      "clip_ratio/high_mean": 1.2168055718575488e-06,
+      "clip_ratio/low_mean": 2.737805482411204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8594860509656428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16243.0,
+      "completions/mean_length": 5508.3359375,
+      "completions/mean_terminated_length": 5422.70068359375,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.9608336761593819,
+      "epoch": 0.46458141674333026,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030600661411881447,
+      "learning_rate": 1e-05,
+      "loss": 0.0369,
+      "num_tokens": 424860847.0,
+      "reward": 0.5625,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999776482582092,
+      "sampling/importance_sampling_ratio/min": 4.006533345091157e-05,
+      "sampling/sampling_logp_difference/max": 10.124999046325684,
+      "sampling/sampling_logp_difference/mean": 0.018935665488243103,
+      "step": 505
+    },
+    {
+      "clip_ratio/high_max": 1.3109260635246756e-05,
+      "clip_ratio/high_mean": 3.277315158811689e-06,
+      "clip_ratio/low_mean": 3.854507008327346e-05,
+      "clip_ratio/low_min": 2.992077043018071e-06,
+      "clip_ratio/region_mean": 4.182238512839831e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16077.0,
+      "completions/mean_length": 7779.4765625,
+      "completions/mean_terminated_length": 7572.96826171875,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 1.0322985425591469,
+      "epoch": 0.46550137994480223,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002075409982353449,
+      "learning_rate": 1e-05,
+      "loss": 0.0939,
+      "num_tokens": 425877532.0,
+      "reward": 0.421875,
+      "reward_std": 0.3337898254394531,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999785423278809,
+      "sampling/importance_sampling_ratio/min": 0.025282513350248337,
+      "sampling/sampling_logp_difference/max": 3.677642345428467,
+      "sampling/sampling_logp_difference/mean": 0.020769601687788963,
+      "step": 506
+    },
+    {
+      "clip_ratio/high_max": 1.4176180684444262e-05,
+      "clip_ratio/high_mean": 4.564619985103491e-06,
+      "clip_ratio/low_mean": 2.2551324207142898e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7115944419620064e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15292.0,
+      "completions/mean_length": 6004.21875,
+      "completions/mean_terminated_length": 5755.1044921875,
+      "completions/min_length": 992.0,
+      "completions/min_terminated_length": 992.0,
+      "entropy": 0.9162944257259369,
+      "epoch": 0.46642134314627415,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0039940495043993,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 426666008.0,
+      "reward": 0.6328125,
+      "reward_std": 0.31140607595443726,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 5.144598981132731e-05,
+      "sampling/sampling_logp_difference/max": 9.874978065490723,
+      "sampling/sampling_logp_difference/mean": 0.01873711869120598,
+      "step": 507
+    },
+    {
+      "clip_ratio/high_max": 3.6937442473572446e-06,
+      "clip_ratio/high_mean": 9.234360618393112e-07,
+      "clip_ratio/low_mean": 3.4857803484555916e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.578123954639523e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14337.0,
+      "completions/mean_length": 6619.6015625,
+      "completions/mean_terminated_length": 6542.71630859375,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 1.1118961870670319,
+      "epoch": 0.46734130634774607,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002274538855999708,
+      "learning_rate": 1e-05,
+      "loss": 0.0259,
+      "num_tokens": 427535397.0,
+      "reward": 0.3125,
+      "reward_std": 0.2177756428718567,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000343322753906,
+      "sampling/importance_sampling_ratio/min": 2.4061378098849673e-06,
+      "sampling/sampling_logp_difference/max": 12.937487602233887,
+      "sampling/sampling_logp_difference/mean": 0.0214434452354908,
+      "step": 508
+    },
+    {
+      "clip_ratio/high_max": 7.764184829284204e-06,
+      "clip_ratio/high_mean": 1.941046207321051e-06,
+      "clip_ratio/low_mean": 2.4530202267669665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6471248474990716e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15923.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6469.9765625,
+      "completions/mean_terminated_length": 6469.9765625,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "entropy": 0.8812271729111671,
+      "epoch": 0.46826126954921804,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020800349302589893,
+      "learning_rate": 1e-05,
+      "loss": 0.0592,
+      "num_tokens": 428379026.0,
+      "reward": 0.546875,
+      "reward_std": 0.2869548797607422,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999022483825684,
+      "sampling/importance_sampling_ratio/min": 9.611256973585114e-05,
+      "sampling/sampling_logp_difference/max": 9.249990463256836,
+      "sampling/sampling_logp_difference/mean": 0.01902790367603302,
+      "step": 509
+    },
+    {
+      "clip_ratio/high_max": 3.3670939956209622e-06,
+      "clip_ratio/high_mean": 8.417734989052406e-07,
+      "clip_ratio/low_mean": 3.1169882220183354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.201165577593201e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16189.0,
+      "completions/mean_length": 7417.2421875,
+      "completions/mean_terminated_length": 7346.6376953125,
+      "completions/min_length": 82.0,
+      "completions/min_terminated_length": 82.0,
+      "entropy": 1.0124703496694565,
+      "epoch": 0.46918123275068996,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0013554802862927318,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 429347777.0,
+      "reward": 0.359375,
+      "reward_std": 0.24039676785469055,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368190765381,
+      "sampling/importance_sampling_ratio/min": 1.086339216271881e-05,
+      "sampling/sampling_logp_difference/max": 11.4301118850708,
+      "sampling/sampling_logp_difference/mean": 0.02034895122051239,
+      "step": 510
+    },
+    {
+      "clip_ratio/high_max": 2.4966960609162925e-05,
+      "clip_ratio/high_mean": 6.241740152290731e-06,
+      "clip_ratio/low_mean": 2.400768698862521e-05,
+      "clip_ratio/low_min": 7.9038825333555e-06,
+      "clip_ratio/region_mean": 3.0249426572481752e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16108.0,
+      "completions/mean_length": 6539.7578125,
+      "completions/mean_terminated_length": 6383.50048828125,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.9707148522138596,
+      "epoch": 0.47010119595216193,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016008630627766252,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 430203402.0,
+      "reward": 0.5078125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999600648880005,
+      "sampling/importance_sampling_ratio/min": 1.7258255269325673e-08,
+      "sampling/sampling_logp_difference/max": 17.874975204467773,
+      "sampling/sampling_logp_difference/mean": 0.01951115019619465,
+      "step": 511
+    },
+    {
+      "clip_ratio/high_max": 7.0406667873612605e-06,
+      "clip_ratio/high_mean": 1.7601666968403151e-06,
+      "clip_ratio/low_mean": 2.4132358305450907e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5892525002291222e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6722.53125,
+      "completions/mean_terminated_length": 6329.78857421875,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.9293247908353806,
+      "epoch": 0.47102115915363385,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002203655894845724,
+      "learning_rate": 1e-05,
+      "loss": 0.0451,
+      "num_tokens": 431082350.0,
+      "reward": 0.46875,
+      "reward_std": 0.18543371558189392,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999996542930603,
+      "sampling/importance_sampling_ratio/min": 0.002989979926496744,
+      "sampling/sampling_logp_difference/max": 5.812488555908203,
+      "sampling/sampling_logp_difference/mean": 0.018750539049506187,
+      "step": 512
+    },
+    {
+      "clip_ratio/high_max": 5.424876235338161e-06,
+      "clip_ratio/high_mean": 1.3562190588345402e-06,
+      "clip_ratio/low_mean": 2.538728870149498e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.674350776032952e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15874.0,
+      "completions/mean_length": 6347.03125,
+      "completions/mean_terminated_length": 5766.3798828125,
+      "completions/min_length": 514.0,
+      "completions/min_terminated_length": 514.0,
+      "entropy": 0.9512053951621056,
+      "epoch": 0.47194112235510577,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002207641489803791,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 431914122.0,
+      "reward": 0.4765625,
+      "reward_std": 0.21648237109184265,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999993085861206,
+      "sampling/importance_sampling_ratio/min": 0.0011340104974806309,
+      "sampling/sampling_logp_difference/max": 6.781994819641113,
+      "sampling/sampling_logp_difference/mean": 0.01931341364979744,
+      "step": 513
+    },
+    {
+      "clip_ratio/high_max": 1.2328315506238141e-05,
+      "clip_ratio/high_mean": 3.0820788765595353e-06,
+      "clip_ratio/low_mean": 4.058695458297734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.366903374375397e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14757.0,
+      "completions/mean_length": 5719.8671875,
+      "completions/mean_terminated_length": 5635.8974609375,
+      "completions/min_length": 14.0,
+      "completions/min_terminated_length": 14.0,
+      "entropy": 0.9754309803247452,
+      "epoch": 0.47286108555657774,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018057655543088913,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 432663249.0,
+      "reward": 0.4921875,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999573230743408,
+      "sampling/importance_sampling_ratio/min": 0.00016155402408912778,
+      "sampling/sampling_logp_difference/max": 8.730670928955078,
+      "sampling/sampling_logp_difference/mean": 0.019999589771032333,
+      "step": 514
+    },
+    {
+      "clip_ratio/high_max": 3.34771721099969e-05,
+      "clip_ratio/high_mean": 8.369293027499225e-06,
+      "clip_ratio/low_mean": 3.319342158647487e-05,
+      "clip_ratio/low_min": 3.644846174211125e-06,
+      "clip_ratio/region_mean": 4.1562714159226744e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16206.0,
+      "completions/mean_length": 5969.1328125,
+      "completions/mean_terminated_length": 5803.81787109375,
+      "completions/min_length": 367.0,
+      "completions/min_terminated_length": 367.0,
+      "entropy": 0.9498241171240807,
+      "epoch": 0.47378104875804966,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002828414784744382,
+      "learning_rate": 1e-05,
+      "loss": 0.0843,
+      "num_tokens": 433448874.0,
+      "reward": 0.4375,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999672174453735,
+      "sampling/importance_sampling_ratio/min": 0.00043074661516584456,
+      "sampling/sampling_logp_difference/max": 7.749990463256836,
+      "sampling/sampling_logp_difference/mean": 0.019238140434026718,
+      "step": 515
+    },
+    {
+      "clip_ratio/high_max": 2.4458067855448462e-05,
+      "clip_ratio/high_mean": 7.50266553950496e-06,
+      "clip_ratio/low_mean": 4.7241341690096306e-05,
+      "clip_ratio/low_min": 4.075511242263019e-06,
+      "clip_ratio/region_mean": 5.4744006320106564e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14714.0,
+      "completions/max_terminated_length": 14714.0,
+      "completions/mean_length": 6808.3671875,
+      "completions/mean_terminated_length": 6808.3671875,
+      "completions/min_length": 857.0,
+      "completions/min_terminated_length": 857.0,
+      "entropy": 0.9247330650687218,
+      "epoch": 0.47470101195952163,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0019250004552304745,
+      "learning_rate": 1e-05,
+      "loss": 0.0535,
+      "num_tokens": 434338609.0,
+      "reward": 0.4921875,
+      "reward_std": 0.36007601022720337,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999670386314392,
+      "sampling/importance_sampling_ratio/min": 0.00025917106540873647,
+      "sampling/sampling_logp_difference/max": 8.25802230834961,
+      "sampling/sampling_logp_difference/mean": 0.01927364431321621,
+      "step": 516
+    },
+    {
+      "clip_ratio/high_max": 2.067027617158601e-05,
+      "clip_ratio/high_mean": 5.167569042896503e-06,
+      "clip_ratio/low_mean": 1.523887078747066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0406439944054e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15500.0,
+      "completions/mean_length": 6119.921875,
+      "completions/mean_terminated_length": 6039.1025390625,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 0.9210109040141106,
+      "epoch": 0.47562097516099355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0022343189921230078,
+      "learning_rate": 1e-05,
+      "loss": 0.0405,
+      "num_tokens": 435145247.0,
+      "reward": 0.5,
+      "reward_std": 0.2467075139284134,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998999834060669,
+      "sampling/importance_sampling_ratio/min": 0.00011216365965083241,
+      "sampling/sampling_logp_difference/max": 9.095551490783691,
+      "sampling/sampling_logp_difference/mean": 0.019618261605501175,
+      "step": 517
+    },
+    {
+      "clip_ratio/high_max": 1.9286600036139134e-05,
+      "clip_ratio/high_mean": 4.821650009034784e-06,
+      "clip_ratio/low_mean": 3.679497240227647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1616622866058606e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16282.0,
+      "completions/mean_length": 6259.0625,
+      "completions/mean_terminated_length": 6179.33837890625,
+      "completions/min_length": 1087.0,
+      "completions/min_terminated_length": 1087.0,
+      "entropy": 0.9430939853191376,
+      "epoch": 0.4765409383624655,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00324260420165956,
+      "learning_rate": 1e-05,
+      "loss": 0.0634,
+      "num_tokens": 435964383.0,
+      "reward": 0.5,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999647736549377,
+      "sampling/importance_sampling_ratio/min": 1.5690335203544237e-05,
+      "sampling/sampling_logp_difference/max": 11.06246566772461,
+      "sampling/sampling_logp_difference/mean": 0.019678015261888504,
+      "step": 518
+    },
+    {
+      "clip_ratio/high_max": 5.182851054996718e-06,
+      "clip_ratio/high_mean": 1.2957127637491794e-06,
+      "clip_ratio/low_mean": 3.5416796038134635e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6712508745040395e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14682.0,
+      "completions/mean_length": 6898.8671875,
+      "completions/mean_terminated_length": 6748.31005859375,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9633238166570663,
+      "epoch": 0.47746090156393745,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0017788221593946218,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 436866830.0,
+      "reward": 0.328125,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000014305114746,
+      "sampling/importance_sampling_ratio/min": 0.007227231748402119,
+      "sampling/sampling_logp_difference/max": 4.929899215698242,
+      "sampling/sampling_logp_difference/mean": 0.019975006580352783,
+      "step": 519
+    },
+    {
+      "clip_ratio/high_max": 1.8337552319280803e-05,
+      "clip_ratio/high_mean": 4.584388079820201e-06,
+      "clip_ratio/low_mean": 3.3715954828039685e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8300342453112535e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16277.0,
+      "completions/mean_length": 6568.8359375,
+      "completions/mean_terminated_length": 6333.2724609375,
+      "completions/min_length": 101.0,
+      "completions/min_terminated_length": 101.0,
+      "entropy": 0.9648878574371338,
+      "epoch": 0.47838086476540936,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021614902652800083,
+      "learning_rate": 1e-05,
+      "loss": 0.079,
+      "num_tokens": 437728081.0,
+      "reward": 0.4140625,
+      "reward_std": 0.24487745761871338,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999655485153198,
+      "sampling/importance_sampling_ratio/min": 0.001384100178256631,
+      "sampling/sampling_logp_difference/max": 6.582705020904541,
+      "sampling/sampling_logp_difference/mean": 0.019699109718203545,
+      "step": 520
+    },
+    {
+      "clip_ratio/high_max": 1.9740967672987608e-05,
+      "clip_ratio/high_mean": 4.935241918246902e-06,
+      "clip_ratio/low_mean": 5.360748559724016e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.0295990477970918e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16261.0,
+      "completions/mean_length": 6709.7265625,
+      "completions/mean_terminated_length": 6233.9423828125,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.791545994579792,
+      "epoch": 0.47930082796688134,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002030634554103017,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 438605294.0,
+      "reward": 0.5,
+      "reward_std": 0.2435920089483261,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999866485595703,
+      "sampling/importance_sampling_ratio/min": 0.00981139950454235,
+      "sampling/sampling_logp_difference/max": 4.624210357666016,
+      "sampling/sampling_logp_difference/mean": 0.01805954799056053,
+      "step": 521
+    },
+    {
+      "clip_ratio/high_max": 7.663652240808005e-06,
+      "clip_ratio/high_mean": 1.9159130602020014e-06,
+      "clip_ratio/low_mean": 2.266609857315416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4582011747042998e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6556.9140625,
+      "completions/mean_terminated_length": 6400.9287109375,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.886083297431469,
+      "epoch": 0.48022079116835326,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014125843299552798,
+      "learning_rate": 1e-05,
+      "loss": 0.0634,
+      "num_tokens": 439462971.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3158818185329437,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999947726726532,
+      "sampling/importance_sampling_ratio/min": 3.454853825246573e-08,
+      "sampling/sampling_logp_difference/max": 17.18090057373047,
+      "sampling/sampling_logp_difference/mean": 0.018355879932641983,
+      "step": 522
+    },
+    {
+      "clip_ratio/high_max": 9.186456281895516e-06,
+      "clip_ratio/high_mean": 2.296614070473879e-06,
+      "clip_ratio/low_mean": 3.2019113405112876e-05,
+      "clip_ratio/low_min": 4.055676527059404e-06,
+      "clip_ratio/region_mean": 3.431572758927359e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 6152.4921875,
+      "completions/mean_terminated_length": 6071.92919921875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9536242336034775,
+      "epoch": 0.48114075436982523,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00171169254463166,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 440268882.0,
+      "reward": 0.484375,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99989914894104,
+      "sampling/importance_sampling_ratio/min": 0.03775034472346306,
+      "sampling/sampling_logp_difference/max": 3.2767605781555176,
+      "sampling/sampling_logp_difference/mean": 0.018800247460603714,
+      "step": 523
+    },
+    {
+      "clip_ratio/high_max": 8.734396942600142e-06,
+      "clip_ratio/high_mean": 2.1835992356500356e-06,
+      "clip_ratio/low_mean": 4.899439159089525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.117799059917161e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15578.0,
+      "completions/mean_length": 5740.796875,
+      "completions/mean_terminated_length": 5656.9921875,
+      "completions/min_length": 731.0,
+      "completions/min_terminated_length": 731.0,
+      "entropy": 0.9311753436923027,
+      "epoch": 0.48206071757129715,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.002236112719401717,
+      "learning_rate": 1e-05,
+      "loss": 0.1033,
+      "num_tokens": 441020904.0,
+      "reward": 0.5078125,
+      "reward_std": 0.34353315830230713,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980628490448,
+      "sampling/importance_sampling_ratio/min": 0.09267321974039078,
+      "sampling/sampling_logp_difference/max": 2.378675699234009,
+      "sampling/sampling_logp_difference/mean": 0.018967337906360626,
+      "step": 524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9387059296605003e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9387059296605003e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15338.0,
+      "completions/max_terminated_length": 15338.0,
+      "completions/mean_length": 7279.078125,
+      "completions/mean_terminated_length": 7279.078125,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "entropy": 1.170717716217041,
+      "epoch": 0.48298068077276907,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0011770959245041013,
+      "learning_rate": 1e-05,
+      "loss": 0.0173,
+      "num_tokens": 441970986.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2382800281047821,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999333620071411,
+      "sampling/importance_sampling_ratio/min": 1.1565300155780278e-05,
+      "sampling/sampling_logp_difference/max": 11.367501258850098,
+      "sampling/sampling_logp_difference/mean": 0.02134837955236435,
+      "step": 525
+    },
+    {
+      "clip_ratio/high_max": 1.838239040807821e-05,
+      "clip_ratio/high_mean": 4.595597602019552e-06,
+      "clip_ratio/low_mean": 3.5013973274544696e-05,
+      "clip_ratio/low_min": 4.0234326661447994e-06,
+      "clip_ratio/region_mean": 3.960957087656425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15485.0,
+      "completions/mean_length": 7376.796875,
+      "completions/mean_terminated_length": 7233.82568359375,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "entropy": 1.0409907028079033,
+      "epoch": 0.48390064397424104,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002001611515879631,
+      "learning_rate": 1e-05,
+      "loss": 0.0362,
+      "num_tokens": 442936808.0,
+      "reward": 0.4453125,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999234676361084,
+      "sampling/importance_sampling_ratio/min": 0.003353495616465807,
+      "sampling/sampling_logp_difference/max": 5.697751998901367,
+      "sampling/sampling_logp_difference/mean": 0.02169732004404068,
+      "step": 526
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.393580459487566e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.393580459487566e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15883.0,
+      "completions/mean_length": 6904.515625,
+      "completions/mean_terminated_length": 6829.8740234375,
+      "completions/min_length": 1159.0,
+      "completions/min_terminated_length": 1159.0,
+      "entropy": 0.9905650988221169,
+      "epoch": 0.48482060717571296,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023104713764041662,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 443843010.0,
+      "reward": 0.3515625,
+      "reward_std": 0.226732075214386,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.0020711510442197323,
+      "sampling/sampling_logp_difference/max": 6.179650783538818,
+      "sampling/sampling_logp_difference/mean": 0.020169749855995178,
+      "step": 527
+    },
+    {
+      "clip_ratio/high_max": 3.274137043263181e-06,
+      "clip_ratio/high_mean": 8.185342608157953e-07,
+      "clip_ratio/low_mean": 3.806211361734313e-05,
+      "clip_ratio/low_min": 4.1808816604316235e-06,
+      "clip_ratio/region_mean": 3.8880647935002344e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15035.0,
+      "completions/max_terminated_length": 15035.0,
+      "completions/mean_length": 6611.21875,
+      "completions/mean_terminated_length": 6611.21875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 0.8890361413359642,
+      "epoch": 0.48574057037718493,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032739758025854826,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 444709854.0,
+      "reward": 0.4140625,
+      "reward_std": 0.30327799916267395,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999891996383667,
+      "sampling/importance_sampling_ratio/min": 0.00029604812152683735,
+      "sampling/sampling_logp_difference/max": 8.124988555908203,
+      "sampling/sampling_logp_difference/mean": 0.018246350809931755,
+      "step": 528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.133989605430543e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.133989605430543e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15999.0,
+      "completions/mean_length": 6928.296875,
+      "completions/mean_terminated_length": 6853.84228515625,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.9614408612251282,
+      "epoch": 0.48666053357865685,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018802061676979065,
+      "learning_rate": 1e-05,
+      "loss": 0.0528,
+      "num_tokens": 445614284.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999129176139832,
+      "sampling/importance_sampling_ratio/min": 0.02033112570643425,
+      "sampling/sampling_logp_difference/max": 3.895602226257324,
+      "sampling/sampling_logp_difference/mean": 0.019618764519691467,
+      "step": 529
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9743174675568298e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9743174675568298e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16194.0,
+      "completions/mean_length": 7946.8671875,
+      "completions/mean_terminated_length": 7812.94482421875,
+      "completions/min_length": 540.0,
+      "completions/min_terminated_length": 540.0,
+      "entropy": 0.9987246319651604,
+      "epoch": 0.48758049678012877,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002164191100746393,
+      "learning_rate": 1e-05,
+      "loss": 0.0192,
+      "num_tokens": 446649731.0,
+      "reward": 0.453125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999844431877136,
+      "sampling/importance_sampling_ratio/min": 0.0018519347067922354,
+      "sampling/sampling_logp_difference/max": 6.291524410247803,
+      "sampling/sampling_logp_difference/mean": 0.020579926669597626,
+      "step": 530
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4596658477094024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4596658477094024e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14446.0,
+      "completions/mean_length": 6763.53125,
+      "completions/mean_terminated_length": 6532.64013671875,
+      "completions/min_length": 834.0,
+      "completions/min_terminated_length": 834.0,
+      "entropy": 0.9593042582273483,
+      "epoch": 0.48850045998160074,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002090689493343234,
+      "learning_rate": 1e-05,
+      "loss": 0.0375,
+      "num_tokens": 447536311.0,
+      "reward": 0.3515625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999126195907593,
+      "sampling/importance_sampling_ratio/min": 0.014640630222856998,
+      "sampling/sampling_logp_difference/max": 4.223954677581787,
+      "sampling/sampling_logp_difference/mean": 0.019683964550495148,
+      "step": 531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.527509309402376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.527509309402376e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15805.0,
+      "completions/mean_length": 7394.40625,
+      "completions/mean_terminated_length": 7323.6220703125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "entropy": 1.0184528306126595,
+      "epoch": 0.48942042318307266,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.002562359906733036,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 448505707.0,
+      "reward": 0.2578125,
+      "reward_std": 0.17123225331306458,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560713768005,
+      "sampling/importance_sampling_ratio/min": 0.0002687747764866799,
+      "sampling/sampling_logp_difference/max": 8.221636772155762,
+      "sampling/sampling_logp_difference/mean": 0.020989736542105675,
+      "step": 532
+    },
+    {
+      "clip_ratio/high_max": 4.772085048898589e-06,
+      "clip_ratio/high_mean": 1.1930212622246472e-06,
+      "clip_ratio/low_mean": 2.0207754744205886e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.140077623380421e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16360.0,
+      "completions/mean_length": 7196.328125,
+      "completions/mean_terminated_length": 6822.84521484375,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 1.0106298848986626,
+      "epoch": 0.49034038638454464,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017445285338908434,
+      "learning_rate": 1e-05,
+      "loss": 0.0153,
+      "num_tokens": 449443709.0,
+      "reward": 0.296875,
+      "reward_std": 0.21436558663845062,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999077320098877,
+      "sampling/importance_sampling_ratio/min": 0.0012854337692260742,
+      "sampling/sampling_logp_difference/max": 6.656659126281738,
+      "sampling/sampling_logp_difference/mean": 0.021059826016426086,
+      "step": 533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.0835892605173285e-05,
+      "clip_ratio/low_min": 3.619411700128694e-06,
+      "clip_ratio/region_mean": 4.0835892605173285e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 7418.3515625,
+      "completions/mean_terminated_length": 7203.17626953125,
+      "completions/min_length": 1445.0,
+      "completions/min_terminated_length": 1445.0,
+      "entropy": 1.002836562693119,
+      "epoch": 0.49126034958601655,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0015701872762292624,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 450412866.0,
+      "reward": 0.328125,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999527335166931,
+      "sampling/importance_sampling_ratio/min": 8.191307279048488e-05,
+      "sampling/sampling_logp_difference/max": 9.409852027893066,
+      "sampling/sampling_logp_difference/mean": 0.020907817408442497,
+      "step": 534
+    },
+    {
+      "clip_ratio/high_max": 1.0691738907553372e-05,
+      "clip_ratio/high_mean": 4.761823504395579e-06,
+      "clip_ratio/low_mean": 9.472978547364619e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 9.949160914857202e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14089.0,
+      "completions/mean_length": 7007.109375,
+      "completions/mean_terminated_length": 6782.064453125,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.9748141467571259,
+      "epoch": 0.4921803127874885,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003912154585123062,
+      "learning_rate": 1e-05,
+      "loss": 0.055,
+      "num_tokens": 451331560.0,
+      "reward": 0.453125,
+      "reward_std": 0.25354722142219543,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9994460344314575,
+      "sampling/importance_sampling_ratio/min": 1.125945416902141e-07,
+      "sampling/sampling_logp_difference/max": 15.999472618103027,
+      "sampling/sampling_logp_difference/mean": 0.026503996923565865,
+      "step": 535
+    },
+    {
+      "clip_ratio/high_max": 1.5173390238487627e-05,
+      "clip_ratio/high_mean": 3.793347559621907e-06,
+      "clip_ratio/low_mean": 3.870478303724667e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.249813082424225e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15336.0,
+      "completions/mean_length": 6605.5,
+      "completions/mean_terminated_length": 6290.064453125,
+      "completions/min_length": 581.0,
+      "completions/min_terminated_length": 581.0,
+      "entropy": 0.9742915332317352,
+      "epoch": 0.49310027598896045,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0029959778767079115,
+      "learning_rate": 1e-05,
+      "loss": 0.0195,
+      "num_tokens": 452197568.0,
+      "reward": 0.46875,
+      "reward_std": 0.3180162310600281,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998653531074524,
+      "sampling/importance_sampling_ratio/min": 0.0005176665727049112,
+      "sampling/sampling_logp_difference/max": 7.566179275512695,
+      "sampling/sampling_logp_difference/mean": 0.019547434523701668,
+      "step": 536
+    },
+    {
+      "clip_ratio/high_max": 4.233987056068145e-06,
+      "clip_ratio/high_mean": 1.0584967640170362e-06,
+      "clip_ratio/low_mean": 3.348358245602867e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.454207922004571e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 6091.828125,
+      "completions/mean_terminated_length": 6010.78759765625,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 0.9893068373203278,
+      "epoch": 0.49402023919043236,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027553467079997063,
+      "learning_rate": 1e-05,
+      "loss": 0.064,
+      "num_tokens": 452995762.0,
+      "reward": 0.3671875,
+      "reward_std": 0.22437798976898193,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000625848770142,
+      "sampling/importance_sampling_ratio/min": 1.8432530168865924e-08,
+      "sampling/sampling_logp_difference/max": 17.80914878845215,
+      "sampling/sampling_logp_difference/mean": 0.02093922719359398,
+      "step": 537
+    },
+    {
+      "clip_ratio/high_max": 2.9927550940556102e-05,
+      "clip_ratio/high_mean": 7.481887735139026e-06,
+      "clip_ratio/low_mean": 5.346296995867306e-05,
+      "clip_ratio/low_min": 5.110593065182911e-06,
+      "clip_ratio/region_mean": 6.094485820540285e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16080.0,
+      "completions/mean_length": 6864.578125,
+      "completions/mean_terminated_length": 6789.6220703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 1.005393773317337,
+      "epoch": 0.49494020239190434,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002985693048685789,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "num_tokens": 453896300.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999870777130127,
+      "sampling/importance_sampling_ratio/min": 1.8929262296296656e-05,
+      "sampling/sampling_logp_difference/max": 10.874801635742188,
+      "sampling/sampling_logp_difference/mean": 0.019800683483481407,
+      "step": 538
+    },
+    {
+      "clip_ratio/high_max": 1.2092638826288749e-05,
+      "clip_ratio/high_mean": 4.037869075546041e-06,
+      "clip_ratio/low_mean": 2.9533587621699553e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3571456697245594e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14651.0,
+      "completions/max_terminated_length": 14651.0,
+      "completions/mean_length": 5828.125,
+      "completions/mean_terminated_length": 5828.125,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 0.909324087202549,
+      "epoch": 0.49586016559337626,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003145795315504074,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 454661564.0,
+      "reward": 0.359375,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999277591705322,
+      "sampling/importance_sampling_ratio/min": 5.3384183047455736e-06,
+      "sampling/sampling_logp_difference/max": 12.140581130981445,
+      "sampling/sampling_logp_difference/mean": 0.019065624102950096,
+      "step": 539
+    },
+    {
+      "clip_ratio/high_max": 2.344680183341552e-05,
+      "clip_ratio/high_mean": 5.86170045835388e-06,
+      "clip_ratio/low_mean": 4.5576647153211525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.143834823684301e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 6213.4140625,
+      "completions/mean_terminated_length": 6051.9765625,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 0.9570266529917717,
+      "epoch": 0.49678012879484823,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026711132377386093,
+      "learning_rate": 1e-05,
+      "loss": 0.116,
+      "num_tokens": 455477577.0,
+      "reward": 0.4296875,
+      "reward_std": 0.28930407762527466,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.00041241716826334596,
+      "sampling/sampling_logp_difference/max": 7.793475151062012,
+      "sampling/sampling_logp_difference/mean": 0.01995767280459404,
+      "step": 540
+    },
+    {
+      "clip_ratio/high_max": 1.5261470707628177e-05,
+      "clip_ratio/high_mean": 3.815367676907044e-06,
+      "clip_ratio/low_mean": 3.6731302770931507e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.054667033415171e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15132.0,
+      "completions/mean_length": 7125.4140625,
+      "completions/mean_terminated_length": 7052.51171875,
+      "completions/min_length": 1374.0,
+      "completions/min_terminated_length": 1374.0,
+      "entropy": 0.9259644895792007,
+      "epoch": 0.49770009199632015,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030442574061453342,
+      "learning_rate": 1e-05,
+      "loss": 0.1227,
+      "num_tokens": 456408966.0,
+      "reward": 0.484375,
+      "reward_std": 0.3816363215446472,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999846816062927,
+      "sampling/importance_sampling_ratio/min": 0.00023056140344124287,
+      "sampling/sampling_logp_difference/max": 8.374993324279785,
+      "sampling/sampling_logp_difference/mean": 0.020200349390506744,
+      "step": 541
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.665321148422663e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.665321148422663e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15909.0,
+      "completions/mean_length": 6472.1640625,
+      "completions/mean_terminated_length": 6314.83349609375,
+      "completions/min_length": 80.0,
+      "completions/min_terminated_length": 80.0,
+      "entropy": 0.8606229647994041,
+      "epoch": 0.49862005519779207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002203581389039755,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 457257011.0,
+      "reward": 0.453125,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998785853385925,
+      "sampling/importance_sampling_ratio/min": 8.579161658417434e-05,
+      "sampling/sampling_logp_difference/max": 9.3635892868042,
+      "sampling/sampling_logp_difference/mean": 0.018575064837932587,
+      "step": 542
+    },
+    {
+      "clip_ratio/high_max": 1.1763763723138254e-05,
+      "clip_ratio/high_mean": 2.9409409307845635e-06,
+      "clip_ratio/low_mean": 2.8100045369683357e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.104098641415476e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16072.0,
+      "completions/max_terminated_length": 16072.0,
+      "completions/mean_length": 7154.0,
+      "completions/mean_terminated_length": 7154.0,
+      "completions/min_length": 920.0,
+      "completions/min_terminated_length": 920.0,
+      "entropy": 0.977513425052166,
+      "epoch": 0.49954001839926404,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.001689116470515728,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 458196355.0,
+      "reward": 0.40625,
+      "reward_std": 0.18543371558189392,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999965250492096,
+      "sampling/importance_sampling_ratio/min": 0.00029606535099446774,
+      "sampling/sampling_logp_difference/max": 8.124930381774902,
+      "sampling/sampling_logp_difference/mean": 0.0198836512863636,
+      "step": 543
+    },
+    {
+      "clip_ratio/high_max": 1.1758888149415725e-05,
+      "clip_ratio/high_mean": 2.9397220373539312e-06,
+      "clip_ratio/low_mean": 4.075526112501393e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.369498378764547e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16003.0,
+      "completions/mean_length": 6878.7265625,
+      "completions/mean_terminated_length": 6727.849609375,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.9291028156876564,
+      "epoch": 0.500459981600736,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001968112075701356,
+      "learning_rate": 1e-05,
+      "loss": 0.0448,
+      "num_tokens": 459095320.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30274122953414917,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00014571755309589207,
+      "sampling/sampling_logp_difference/max": 8.833840370178223,
+      "sampling/sampling_logp_difference/mean": 0.019927173852920532,
+      "step": 544
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.1461796147123096e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.1461796147123096e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15599.0,
+      "completions/mean_length": 7187.96875,
+      "completions/mean_terminated_length": 7042.00048828125,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "entropy": 1.1720879971981049,
+      "epoch": 0.5013799448022079,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002588641829788685,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 460042660.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998769760131836,
+      "sampling/importance_sampling_ratio/min": 4.738242012081173e-07,
+      "sampling/sampling_logp_difference/max": 14.562429428100586,
+      "sampling/sampling_logp_difference/mean": 0.021826796233654022,
+      "step": 545
+    },
+    {
+      "clip_ratio/high_max": 1.55452166836767e-05,
+      "clip_ratio/high_mean": 3.886304170919175e-06,
+      "clip_ratio/low_mean": 4.735719005566352e-05,
+      "clip_ratio/low_min": 4.235134838381782e-06,
+      "clip_ratio/region_mean": 5.1243494908703724e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16019.0,
+      "completions/mean_length": 6278.078125,
+      "completions/mean_terminated_length": 6035.5361328125,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "entropy": 0.8143310993909836,
+      "epoch": 0.5022999080036799,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002047745743766427,
+      "learning_rate": 1e-05,
+      "loss": 0.064,
+      "num_tokens": 460864862.0,
+      "reward": 0.625,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999461770057678,
+      "sampling/importance_sampling_ratio/min": 0.011549573391675949,
+      "sampling/sampling_logp_difference/max": 4.461106777191162,
+      "sampling/sampling_logp_difference/mean": 0.017143042758107185,
+      "step": 546
+    },
+    {
+      "clip_ratio/high_max": 2.9079910746077076e-06,
+      "clip_ratio/high_mean": 7.269977686519269e-07,
+      "clip_ratio/low_mean": 6.497366200619581e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.224363969271508e-06,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13575.0,
+      "completions/mean_length": 5664.8828125,
+      "completions/mean_terminated_length": 5494.73828125,
+      "completions/min_length": 777.0,
+      "completions/min_terminated_length": 777.0,
+      "entropy": 0.9489249512553215,
+      "epoch": 0.5032198712051518,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002950560301542282,
+      "learning_rate": 1e-05,
+      "loss": 0.0867,
+      "num_tokens": 461608471.0,
+      "reward": 0.625,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999043345451355,
+      "sampling/importance_sampling_ratio/min": 1.6701715139788575e-05,
+      "sampling/sampling_logp_difference/max": 10.999999046325684,
+      "sampling/sampling_logp_difference/mean": 0.019181005656719208,
+      "step": 547
+    },
+    {
+      "clip_ratio/high_max": 1.2411757779773325e-05,
+      "clip_ratio/high_mean": 3.102939444943331e-06,
+      "clip_ratio/low_mean": 2.458288531670405e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7685824761647382e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16223.0,
+      "completions/mean_length": 6914.4375,
+      "completions/mean_terminated_length": 6839.8740234375,
+      "completions/min_length": 671.0,
+      "completions/min_terminated_length": 671.0,
+      "entropy": 0.9416745603084564,
+      "epoch": 0.5041398344066237,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0013792186509817839,
+      "learning_rate": 1e-05,
+      "loss": 0.0112,
+      "num_tokens": 462511519.0,
+      "reward": 0.3671875,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999217391014099,
+      "sampling/importance_sampling_ratio/min": 4.006533345091157e-05,
+      "sampling/sampling_logp_difference/max": 10.124999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01967109739780426,
+      "step": 548
+    },
+    {
+      "clip_ratio/high_max": 7.5066598128614714e-06,
+      "clip_ratio/high_mean": 1.8766649532153679e-06,
+      "clip_ratio/low_mean": 3.393825062403266e-05,
+      "clip_ratio/low_min": 3.3629271456447896e-06,
+      "clip_ratio/region_mean": 3.581491563409145e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 7343.296875,
+      "completions/mean_terminated_length": 7051.6611328125,
+      "completions/min_length": 564.0,
+      "completions/min_terminated_length": 564.0,
+      "entropy": 0.845381110906601,
+      "epoch": 0.5050597976080957,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028722358401864767,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 463472581.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2880156934261322,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880194664001,
+      "sampling/importance_sampling_ratio/min": 1.5694884496042505e-05,
+      "sampling/sampling_logp_difference/max": 11.062175750732422,
+      "sampling/sampling_logp_difference/mean": 0.018903033807873726,
+      "step": 549
+    },
+    {
+      "clip_ratio/high_max": 1.6802483287392533e-05,
+      "clip_ratio/high_mean": 5.505368051217374e-06,
+      "clip_ratio/low_mean": 2.8057194754183e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.356256252118328e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13716.0,
+      "completions/mean_length": 6022.4375,
+      "completions/mean_terminated_length": 5940.8505859375,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "entropy": 0.9279188066720963,
+      "epoch": 0.5059797608095676,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002812078921124339,
+      "learning_rate": 1e-05,
+      "loss": 0.0074,
+      "num_tokens": 464263709.0,
+      "reward": 0.421875,
+      "reward_std": 0.26120057702064514,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000264644622803,
+      "sampling/importance_sampling_ratio/min": 0.0008089813054539263,
+      "sampling/sampling_logp_difference/max": 7.119734764099121,
+      "sampling/sampling_logp_difference/mean": 0.01863965392112732,
+      "step": 550
+    },
+    {
+      "clip_ratio/high_max": 1.799457299966889e-05,
+      "clip_ratio/high_mean": 5.5325897960756265e-06,
+      "clip_ratio/low_mean": 3.587696073736879e-05,
+      "clip_ratio/low_min": 2.965106659758021e-06,
+      "clip_ratio/region_mean": 4.140955002185365e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16333.0,
+      "completions/mean_length": 6888.6328125,
+      "completions/mean_terminated_length": 6813.8662109375,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 1.0720202773809433,
+      "epoch": 0.5068997240110396,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001776764984242618,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 465167502.0,
+      "reward": 0.3203125,
+      "reward_std": 0.2961437702178955,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945342540741,
+      "sampling/importance_sampling_ratio/min": 0.0013267829781398177,
+      "sampling/sampling_logp_difference/max": 6.624998092651367,
+      "sampling/sampling_logp_difference/mean": 0.02100517973303795,
+      "step": 551
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.568914848983695e-05,
+      "clip_ratio/low_min": 3.652834493550472e-06,
+      "clip_ratio/region_mean": 3.568914848983695e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14283.0,
+      "completions/mean_length": 6626.7578125,
+      "completions/mean_terminated_length": 6549.92919921875,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9632527679204941,
+      "epoch": 0.5078196872125115,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016460138140246272,
+      "learning_rate": 1e-05,
+      "loss": 0.0554,
+      "num_tokens": 466034535.0,
+      "reward": 0.5,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000152587890625,
+      "sampling/importance_sampling_ratio/min": 0.0002774179738480598,
+      "sampling/sampling_logp_difference/max": 8.189985275268555,
+      "sampling/sampling_logp_difference/mean": 0.020494937896728516,
+      "step": 552
+    },
+    {
+      "clip_ratio/high_max": 9.810846677282825e-06,
+      "clip_ratio/high_mean": 2.4527116693207063e-06,
+      "clip_ratio/low_mean": 2.4154636378170835e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.660734804749154e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16169.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 6685.484375,
+      "completions/mean_terminated_length": 6685.484375,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 0.9092860966920853,
+      "epoch": 0.5087396504139834,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019802958704531193,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 466911965.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 0.0020434472244232893,
+      "sampling/sampling_logp_difference/max": 6.193117141723633,
+      "sampling/sampling_logp_difference/mean": 0.02000512182712555,
+      "step": 553
+    },
+    {
+      "clip_ratio/high_max": 3.24397274198418e-06,
+      "clip_ratio/high_mean": 8.10993185496045e-07,
+      "clip_ratio/low_mean": 2.4120176362885104e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.493116954838115e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7105.1171875,
+      "completions/mean_terminated_length": 7032.05517578125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 1.046683594584465,
+      "epoch": 0.5096596136154554,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.002490658313035965,
+      "learning_rate": 1e-05,
+      "loss": 0.0077,
+      "num_tokens": 467844820.0,
+      "reward": 0.2578125,
+      "reward_std": 0.17123225331306458,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999809265136719,
+      "sampling/importance_sampling_ratio/min": 7.140394586713228e-07,
+      "sampling/sampling_logp_difference/max": 14.152327537536621,
+      "sampling/sampling_logp_difference/mean": 0.020726388320326805,
+      "step": 554
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.0303147582344536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0303147582344536e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15969.0,
+      "completions/max_terminated_length": 15969.0,
+      "completions/mean_length": 6806.5546875,
+      "completions/mean_terminated_length": 6806.5546875,
+      "completions/min_length": 605.0,
+      "completions/min_terminated_length": 605.0,
+      "entropy": 0.9514358267188072,
+      "epoch": 0.5105795768169273,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002886313945055008,
+      "learning_rate": 1e-05,
+      "loss": 0.0331,
+      "num_tokens": 468732451.0,
+      "reward": 0.3203125,
+      "reward_std": 0.23250603675842285,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999695420265198,
+      "sampling/importance_sampling_ratio/min": 3.148883251924417e-06,
+      "sampling/sampling_logp_difference/max": 12.668462753295898,
+      "sampling/sampling_logp_difference/mean": 0.019308820366859436,
+      "step": 555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.485187078742456e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.485187078742456e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16075.0,
+      "completions/mean_length": 6238.546875,
+      "completions/mean_terminated_length": 5995.05615234375,
+      "completions/min_length": 12.0,
+      "completions/min_terminated_length": 12.0,
+      "entropy": 0.9408878460526466,
+      "epoch": 0.5114995400183993,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002731110667809844,
+      "learning_rate": 1e-05,
+      "loss": 0.0622,
+      "num_tokens": 469551145.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3237774670124054,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999334812164307,
+      "sampling/importance_sampling_ratio/min": 0.000488168589072302,
+      "sampling/sampling_logp_difference/max": 7.624849796295166,
+      "sampling/sampling_logp_difference/mean": 0.01883235014975071,
+      "step": 556
+    },
+    {
+      "clip_ratio/high_max": 3.5477096389513463e-06,
+      "clip_ratio/high_mean": 8.869274097378366e-07,
+      "clip_ratio/low_mean": 2.5422534008612274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.630946141835011e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16342.0,
+      "completions/mean_length": 7354.5,
+      "completions/mean_terminated_length": 7283.4013671875,
+      "completions/min_length": 119.0,
+      "completions/min_terminated_length": 119.0,
+      "entropy": 0.9548593312501907,
+      "epoch": 0.5124195032198712,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022831051610410213,
+      "learning_rate": 1e-05,
+      "loss": 0.004,
+      "num_tokens": 470510305.0,
+      "reward": 0.4609375,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999933123588562,
+      "sampling/importance_sampling_ratio/min": 0.00029948100564070046,
+      "sampling/sampling_logp_difference/max": 8.113459587097168,
+      "sampling/sampling_logp_difference/mean": 0.020626772195100784,
+      "step": 557
+    },
+    {
+      "clip_ratio/high_max": 1.0478707963557099e-05,
+      "clip_ratio/high_mean": 2.6196769908892747e-06,
+      "clip_ratio/low_mean": 4.646405352559668e-05,
+      "clip_ratio/low_min": 9.308073458669242e-06,
+      "clip_ratio/region_mean": 4.908373023226886e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16222.0,
+      "completions/mean_length": 7481.421875,
+      "completions/mean_terminated_length": 7119.5283203125,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 0.9302244186401367,
+      "epoch": 0.5133394664213431,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0015396618982777,
+      "learning_rate": 1e-05,
+      "loss": 0.0944,
+      "num_tokens": 471486799.0,
+      "reward": 0.34375,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397993087769,
+      "sampling/importance_sampling_ratio/min": 0.0004175819631200284,
+      "sampling/sampling_logp_difference/max": 7.78102970123291,
+      "sampling/sampling_logp_difference/mean": 0.019920824095606804,
+      "step": 558
+    },
+    {
+      "clip_ratio/high_max": 1.2743131946990616e-05,
+      "clip_ratio/high_mean": 3.185782986747654e-06,
+      "clip_ratio/low_mean": 3.139938735330361e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.458517039689468e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14634.0,
+      "completions/mean_length": 7333.9375,
+      "completions/mean_terminated_length": 7042.0,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0290198475122452,
+      "epoch": 0.5142594296228151,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002122553065419197,
+      "learning_rate": 1e-05,
+      "loss": 0.0653,
+      "num_tokens": 472443991.0,
+      "reward": 0.359375,
+      "reward_std": 0.23356688022613525,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000252723693848,
+      "sampling/importance_sampling_ratio/min": 0.00011467799777165055,
+      "sampling/sampling_logp_difference/max": 9.073382377624512,
+      "sampling/sampling_logp_difference/mean": 0.020558707416057587,
+      "step": 559
+    },
+    {
+      "clip_ratio/high_max": 2.856805417650321e-05,
+      "clip_ratio/high_mean": 7.142013544125803e-06,
+      "clip_ratio/low_mean": 4.716298451512557e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.430499885505924e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16248.0,
+      "completions/mean_length": 6908.953125,
+      "completions/mean_terminated_length": 6681.55224609375,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "entropy": 0.9942271336913109,
+      "epoch": 0.515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0017197602428495884,
+      "learning_rate": 1e-05,
+      "loss": 0.1309,
+      "num_tokens": 473346577.0,
+      "reward": 0.421875,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999131560325623,
+      "sampling/importance_sampling_ratio/min": 0.00016969948774203658,
+      "sampling/sampling_logp_difference/max": 8.68148136138916,
+      "sampling/sampling_logp_difference/mean": 0.019906114786863327,
+      "step": 560
+    },
+    {
+      "clip_ratio/high_max": 2.4387230496358825e-05,
+      "clip_ratio/high_mean": 7.2725478048596415e-06,
+      "clip_ratio/low_mean": 3.3024165190909116e-05,
+      "clip_ratio/low_min": 2.9529187486332376e-06,
+      "clip_ratio/region_mean": 4.029671254102141e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 7543.046875,
+      "completions/mean_terminated_length": 7183.658203125,
+      "completions/min_length": 894.0,
+      "completions/min_terminated_length": 894.0,
+      "entropy": 0.973315916955471,
+      "epoch": 0.516099356025759,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001618197187781334,
+      "learning_rate": 1e-05,
+      "loss": 0.0434,
+      "num_tokens": 474330663.0,
+      "reward": 0.28125,
+      "reward_std": 0.28353503346443176,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999313950538635,
+      "sampling/importance_sampling_ratio/min": 2.1410157557966158e-07,
+      "sampling/sampling_logp_difference/max": 15.356815338134766,
+      "sampling/sampling_logp_difference/mean": 0.019991599023342133,
+      "step": 561
+    },
+    {
+      "clip_ratio/high_max": 1.8185269482273725e-05,
+      "clip_ratio/high_mean": 4.546317370568431e-06,
+      "clip_ratio/low_mean": 5.2758662491214636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7304980941808026e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15773.0,
+      "completions/mean_length": 7136.375,
+      "completions/mean_terminated_length": 6838.064453125,
+      "completions/min_length": 829.0,
+      "completions/min_terminated_length": 829.0,
+      "entropy": 0.8573452606797218,
+      "epoch": 0.5170193192272309,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025291196070611477,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 475262071.0,
+      "reward": 0.453125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999455213546753,
+      "sampling/importance_sampling_ratio/min": 5.8296889619668946e-05,
+      "sampling/sampling_logp_difference/max": 9.749961853027344,
+      "sampling/sampling_logp_difference/mean": 0.018726464360952377,
+      "step": 562
+    },
+    {
+      "clip_ratio/high_max": 1.9233400280427304e-05,
+      "clip_ratio/high_mean": 4.808350070106826e-06,
+      "clip_ratio/low_mean": 4.3801222432193754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.860957244545716e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 6538.765625,
+      "completions/mean_terminated_length": 6138.552734375,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8312613591551781,
+      "epoch": 0.5179392824287029,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018195402808487415,
+      "learning_rate": 1e-05,
+      "loss": 0.1266,
+      "num_tokens": 476119385.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3674348294734955,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999343156814575,
+      "sampling/importance_sampling_ratio/min": 0.005875314120203257,
+      "sampling/sampling_logp_difference/max": 5.136995792388916,
+      "sampling/sampling_logp_difference/mean": 0.018957480788230896,
+      "step": 563
+    },
+    {
+      "clip_ratio/high_max": 1.4299099348136224e-05,
+      "clip_ratio/high_mean": 3.574774837034056e-06,
+      "clip_ratio/low_mean": 2.9377598366409075e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.295237320344313e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6692.078125,
+      "completions/mean_terminated_length": 5870.72900390625,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 0.943247564136982,
+      "epoch": 0.5188592456301748,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001623075339011848,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 476995139.0,
+      "reward": 0.53125,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999014139175415,
+      "sampling/importance_sampling_ratio/min": 0.0003255821648053825,
+      "sampling/sampling_logp_difference/max": 8.029895782470703,
+      "sampling/sampling_logp_difference/mean": 0.019327864050865173,
+      "step": 564
+    },
+    {
+      "clip_ratio/high_max": 2.547848680478637e-06,
+      "clip_ratio/high_mean": 6.369621701196593e-07,
+      "clip_ratio/low_mean": 5.479312403622316e-05,
+      "clip_ratio/low_min": 8.624037718618638e-06,
+      "clip_ratio/region_mean": 5.543008592212573e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15957.0,
+      "completions/mean_length": 7118.40625,
+      "completions/mean_terminated_length": 6896.0322265625,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "entropy": 1.051003873348236,
+      "epoch": 0.5197792088316467,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034032040275633335,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 477926583.0,
+      "reward": 0.359375,
+      "reward_std": 0.30115145444869995,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.00037551531568169594,
+      "sampling/sampling_logp_difference/max": 7.887211322784424,
+      "sampling/sampling_logp_difference/mean": 0.021631836891174316,
+      "step": 565
+    },
+    {
+      "clip_ratio/high_max": 3.823331553576281e-06,
+      "clip_ratio/high_mean": 9.558328883940703e-07,
+      "clip_ratio/low_mean": 1.506989860899921e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.602573161108012e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 7555.8515625,
+      "completions/mean_terminated_length": 7415.72265625,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "entropy": 0.9771487265825272,
+      "epoch": 0.5206991720331187,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014035169733688235,
+      "learning_rate": 1e-05,
+      "loss": 0.0089,
+      "num_tokens": 478914724.0,
+      "reward": 0.1875,
+      "reward_std": 0.19673939049243927,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999145865440369,
+      "sampling/importance_sampling_ratio/min": 0.0017069041496142745,
+      "sampling/sampling_logp_difference/max": 6.373074054718018,
+      "sampling/sampling_logp_difference/mean": 0.020011281594634056,
+      "step": 566
+    },
+    {
+      "clip_ratio/high_max": 4.262138645572122e-06,
+      "clip_ratio/high_mean": 2.0894199224130716e-06,
+      "clip_ratio/low_mean": 2.9273888458192232e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1363308380605304e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 6505.671875,
+      "completions/mean_terminated_length": 6019.85205078125,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 0.9913810566067696,
+      "epoch": 0.5216191352345906,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0012457151897251606,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 479766874.0,
+      "reward": 0.3984375,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999585151672363,
+      "sampling/importance_sampling_ratio/min": 5.239284206481898e-08,
+      "sampling/sampling_logp_difference/max": 16.764495849609375,
+      "sampling/sampling_logp_difference/mean": 0.01945749670267105,
+      "step": 567
+    },
+    {
+      "clip_ratio/high_max": 4.419772267283406e-06,
+      "clip_ratio/high_mean": 1.1049430668208515e-06,
+      "clip_ratio/low_mean": 3.3968740126510966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.507368319333182e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15589.0,
+      "completions/max_terminated_length": 15589.0,
+      "completions/mean_length": 6709.96875,
+      "completions/mean_terminated_length": 6709.96875,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 1.053658738732338,
+      "epoch": 0.5225390984360626,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002912909025326371,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 480644782.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2041109800338745,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000216960906982,
+      "sampling/importance_sampling_ratio/min": 0.00010272916551912203,
+      "sampling/sampling_logp_difference/max": 9.183414459228516,
+      "sampling/sampling_logp_difference/mean": 0.020628605037927628,
+      "step": 568
+    },
+    {
+      "clip_ratio/high_max": 1.5635781892342493e-05,
+      "clip_ratio/high_mean": 5.148336185811786e-06,
+      "clip_ratio/low_mean": 7.926051148388069e-05,
+      "clip_ratio/low_min": 9.047379990079207e-06,
+      "clip_ratio/region_mean": 8.440884812443983e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15677.0,
+      "completions/max_terminated_length": 15677.0,
+      "completions/mean_length": 6712.8515625,
+      "completions/mean_terminated_length": 6712.8515625,
+      "completions/min_length": 18.0,
+      "completions/min_terminated_length": 18.0,
+      "entropy": 0.9288468211889267,
+      "epoch": 0.5234590616375345,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0028935675509274006,
+      "learning_rate": 1e-05,
+      "loss": 0.0293,
+      "num_tokens": 481525875.0,
+      "reward": 0.328125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999656677246094,
+      "sampling/importance_sampling_ratio/min": 0.0003157128521706909,
+      "sampling/sampling_logp_difference/max": 8.060677528381348,
+      "sampling/sampling_logp_difference/mean": 0.0201251357793808,
+      "step": 569
+    },
+    {
+      "clip_ratio/high_max": 1.1007121202055714e-05,
+      "clip_ratio/high_mean": 2.7517803005139285e-06,
+      "clip_ratio/low_mean": 4.98413718332813e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2593152645386e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16247.0,
+      "completions/mean_length": 7452.125,
+      "completions/mean_terminated_length": 7164.0,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "entropy": 0.8201636075973511,
+      "epoch": 0.5243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0014447550056502223,
+      "learning_rate": 1e-05,
+      "loss": 0.1068,
+      "num_tokens": 482498539.0,
+      "reward": 0.25,
+      "reward_std": 0.3145885467529297,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999352097511292,
+      "sampling/importance_sampling_ratio/min": 0.0008213221444748342,
+      "sampling/sampling_logp_difference/max": 7.104595184326172,
+      "sampling/sampling_logp_difference/mean": 0.018142810091376305,
+      "step": 570
+    },
+    {
+      "clip_ratio/high_max": 3.4893782867584378e-06,
+      "clip_ratio/high_mean": 8.723445716896094e-07,
+      "clip_ratio/low_mean": 2.5241818775612046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6114163347301655e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16242.0,
+      "completions/mean_length": 5997.6484375,
+      "completions/mean_terminated_length": 5915.8662109375,
+      "completions/min_length": 15.0,
+      "completions/min_terminated_length": 15.0,
+      "entropy": 0.9595593363046646,
+      "epoch": 0.5252989880404784,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013929647393524647,
+      "learning_rate": 1e-05,
+      "loss": -0.0018,
+      "num_tokens": 483286590.0,
+      "reward": 0.421875,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000566244125366,
+      "sampling/importance_sampling_ratio/min": 6.860717985546216e-05,
+      "sampling/sampling_logp_difference/max": 9.587113380432129,
+      "sampling/sampling_logp_difference/mean": 0.019294174388051033,
+      "step": 571
+    },
+    {
+      "clip_ratio/high_max": 1.2741817272399203e-05,
+      "clip_ratio/high_mean": 3.1854543180998007e-06,
+      "clip_ratio/low_mean": 3.2705364901630674e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.589081939026073e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15962.0,
+      "completions/mean_length": 6706.4140625,
+      "completions/mean_terminated_length": 6474.15234375,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9320398196578026,
+      "epoch": 0.5262189512419503,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0020693838596343994,
+      "learning_rate": 1e-05,
+      "loss": 0.0713,
+      "num_tokens": 484164003.0,
+      "reward": 0.4296875,
+      "reward_std": 0.30744946002960205,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999852180480957,
+      "sampling/importance_sampling_ratio/min": 0.011049352586269379,
+      "sampling/sampling_logp_difference/max": 4.505383491516113,
+      "sampling/sampling_logp_difference/mean": 0.01968679018318653,
+      "step": 572
+    },
+    {
+      "clip_ratio/high_max": 1.783004472599714e-05,
+      "clip_ratio/high_mean": 4.457511181499285e-06,
+      "clip_ratio/low_mean": 2.067615122314237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5133662290954817e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15636.0,
+      "completions/mean_length": 5317.96875,
+      "completions/mean_terminated_length": 5230.83447265625,
+      "completions/min_length": 344.0,
+      "completions/min_terminated_length": 344.0,
+      "entropy": 0.891069769859314,
+      "epoch": 0.5271389144434223,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004261080641299486,
+      "learning_rate": 1e-05,
+      "loss": 0.0528,
+      "num_tokens": 484864799.0,
+      "reward": 0.5234375,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999099969863892,
+      "sampling/importance_sampling_ratio/min": 0.00014285604993347079,
+      "sampling/sampling_logp_difference/max": 8.853672981262207,
+      "sampling/sampling_logp_difference/mean": 0.01876065693795681,
+      "step": 573
+    },
+    {
+      "clip_ratio/high_max": 6.954531272640452e-06,
+      "clip_ratio/high_mean": 1.738632818160113e-06,
+      "clip_ratio/low_mean": 4.1548010585756856e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.328664340391697e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6978.7890625,
+      "completions/mean_terminated_length": 6596.46337890625,
+      "completions/min_length": 710.0,
+      "completions/min_terminated_length": 710.0,
+      "entropy": 0.9322286397218704,
+      "epoch": 0.5280588776448942,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0013973438180983067,
+      "learning_rate": 1e-05,
+      "loss": 0.0396,
+      "num_tokens": 485779676.0,
+      "reward": 0.3125,
+      "reward_std": 0.2675113081932068,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999111890792847,
+      "sampling/importance_sampling_ratio/min": 0.00024690330610610545,
+      "sampling/sampling_logp_difference/max": 8.306513786315918,
+      "sampling/sampling_logp_difference/mean": 0.019345812499523163,
+      "step": 574
+    },
+    {
+      "clip_ratio/high_max": 1.4024310985405464e-05,
+      "clip_ratio/high_mean": 3.506077746351366e-06,
+      "clip_ratio/low_mean": 3.8480168882415455e-05,
+      "clip_ratio/low_min": 8.625057944300352e-06,
+      "clip_ratio/region_mean": 4.198624606033263e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16178.0,
+      "completions/mean_length": 6046.4921875,
+      "completions/mean_terminated_length": 5965.09423828125,
+      "completions/min_length": 997.0,
+      "completions/min_terminated_length": 997.0,
+      "entropy": 1.0245087146759033,
+      "epoch": 0.5289788408463661,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0015273626195266843,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 486574779.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998648166656494,
+      "sampling/importance_sampling_ratio/min": 0.00043810487841255963,
+      "sampling/sampling_logp_difference/max": 7.7330522537231445,
+      "sampling/sampling_logp_difference/mean": 0.01977401226758957,
+      "step": 575
+    },
+    {
+      "clip_ratio/high_max": 1.1012245522579178e-05,
+      "clip_ratio/high_mean": 2.7530613806447946e-06,
+      "clip_ratio/low_mean": 2.9637111538249883e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239017382838938e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16086.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 5987.0859375,
+      "completions/mean_terminated_length": 5987.0859375,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.9373713582754135,
+      "epoch": 0.5298988040478381,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003076995024457574,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 487366590.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24830511212348938,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000369548797607,
+      "sampling/importance_sampling_ratio/min": 0.0004714882234111428,
+      "sampling/sampling_logp_difference/max": 7.659616470336914,
+      "sampling/sampling_logp_difference/mean": 0.018766682595014572,
+      "step": 576
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 487366590,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/zero_to_fp32.py b/dapo_lorafa_20251202_173337/checkpoint-576/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_lorafa_20251202_173337/checkpoint-576/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# debug verbosity flag: a truthy value enables the extra prints guarded by ``if debug:``;
+# the command line entry point overwrites it with the value of ``--debug``
+debug = 0
+
+# load to cpu: every ``torch.load`` call in this script passes ``map_location=device``
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable params of a zero stage <= 2 checkpoint into ``state_dict`` (updated in place).
+
+    Args:
+        - ``state_dict``: dict that receives one entry per param name
+        - ``world_size``: number of partitions; also drives the alignment used by the sanity check
+        - ``fp32_flat_groups``: indexed as ``fp32_flat_groups[rank][group]``, the fp32 partition each
+          rank holds for each param group
+        - ``zero_model_states``: parsed model states; only the ``param_shapes`` of rank 0 are used
+
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed numel of a group differs from its available numel
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. for each param group, concatenate the partitions of all ranks into one flat fp32 vector
+    # 2. walk the ``{name: shape}`` dict of that group in order and carve each param out of the
+    #    vector at a running offset, reshaped to its recorded shape
+    # 3. check that the consumed and available numels agree once both are aligned
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group ``i`` as held by every rank, in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups; only reported in debug mode, the loop below re-derives it per group
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts at 0 for every group since each group has its own merged vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # ``shape`` may or may not offer ``numel()``; fall back to the product of its entries
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # slice of the merged vector (narrow + view), reshaped to the param's shape
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round ``x`` up to the next multiple of ``align_to``
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) written to ``output_dir``.
+    The weights are saved as ``pytorch_model*.bin`` (``torch.save``) or ``model*.safetensors`` and can be
+    used for training without DeepSpeed. When the result is sharded, an index json file mapping every
+    weight to its shard file is written next to the shards.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes everything into one file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: if ``safetensors`` / ``huggingface_hub`` is needed for the requested options but not installed
+    """
+
+    # Dependency pre-check
+    # (done before any checkpoint is loaded, so a missing package is reported right away)
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # (lazy mode: entries are only turned into torch tensors shard by shard in the save loop below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: the split is planned on empty tensors that only
+        # carry shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: minimal stand-in exposing the two attributes read below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        # materialize only the tensors of the current shard
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/README.md b/dapo_milora_plus_20251201_131939/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1f03ef0451784218b16e8ef0ad1a9caf440e512
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/README.md
@@ -0,0 +1,68 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: transformers
+model_name: dapo_milora_plus_20251201_131939
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+
+# Model Card for dapo_milora_plus_20251201_131939
+
+This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="dapo_milora_plus_20251201_131939", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/56v55mci) 
+
+
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
+### Framework versions
+
+- TRL: 0.25.0
+- Transformers: 4.57.1
+- Pytorch: 2.8.0
+- Datasets: 4.4.1
+- Tokenizers: 0.22.1
+
+## Citations
+
+Cite GRPO as:
+
+```bibtex
+@article{shao2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+
+```
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/README.md b/dapo_milora_plus_20251201_131939/checkpoint-128/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-128/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/adapter_config.json
@@ -0,0 +1,40 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-128/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/latest b/dapo_milora_plus_20251201_131939/checkpoint-128/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b4db7fb020d9ef75e52048bf0cde7481e3ef9351
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/latest
@@ -0,0 +1 @@
+global_step128
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-128/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-128/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f9752b63fab19d643d532ada018b0f2f19494a35
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json
@@ -0,0 +1,4002 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.11775528978840846,
+  "eval_steps": 500,
+  "global_step": 128,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 102643751,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-128/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-128/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Module-wide verbosity switch: a truthy value enables the extra diagnostic prints guarded by
+# ``if debug:`` in the functions below. The ``__main__`` block overwrites it from ``--debug``.
+debug = 0
+
+# load to cpu
+# (passed as ``map_location`` to the ``torch.load`` calls in this file)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/README.md b/dapo_milora_plus_20251201_131939/checkpoint-192/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-192/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/adapter_config.json
@@ -0,0 +1,40 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-192/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/latest b/dapo_milora_plus_20251201_131939/checkpoint-192/latest
new file mode 100644
index 0000000000000000000000000000000000000000..36721df7ef9c6f050f37be6e76b3d130ed5cbfc7
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/latest
@@ -0,0 +1 @@
+global_step192
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-192/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-192/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-192/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9d1b140006d37df9911f8e79bb9a416d4e546e2
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/trainer_state.json
@@ -0,0 +1,5986 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1766329346826127,
+  "eval_steps": 500,
+  "global_step": 192,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 161103384,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-192/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-192/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# debug switch: any truthy value makes parse_model_states print the buffer names and
+# frozen param shapes it finds
+debug = 0
+
+# load to cpu
+# (passed as ``map_location`` to every ``torch.load`` call below)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load every optimizer state shard and pull out the flat fp32 master-weight groups.
+
+    Args:
+        - ``files``: paths of the optimizer state shards to load
+        - ``ds_checkpoint_dir``: checkpoint folder, only used in the shard-count error message
+
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)``, where ``fp32_flat_groups`` has one entry per loaded shard
+
+    Raises:
+        - ``ValueError``: if the first shard records no zero stage, if the number of shards differs from the
+          recorded partition count, or if the zero stage is neither ``<= 2`` nor ``3``
+    """
+    total_files = len(files)
+
+    shard_states = []
+    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
+        # only the fp32 master weights are needed, so the potentially huge nested optimizer states are
+        # dropped right away; the default of None covers shards where a helper script already removed them
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        shard_states.append(shard)
+
+    first_optim_state = shard_states[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage > 2 and zero_stage != 3:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS if zero_stage <= 2 else FP32_FLAT_GROUPS
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in shard_states]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild the fp32 state_dict from the shards stored in a deepspeed checkpoint folder.
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction routine
+
+    Returns:
+        - the state_dict built by the zero2 routine when the detected stage is ``<= 2``, or by the zero3
+          routine when it is ``3``
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # pick the reconstruction routine matching the detected stage
+    if zero_stage <= 2:
+        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen params recorded in the first model state into ``state_dict`` (updated in place).
+
+    Nothing happens when the first model state has no frozen param shapes.
+
+    Args:
+        - ``state_dict``: dict receiving one entry per frozen param name
+        - ``zero_model_states``: parsed model states; only the first one is read
+    """
+    first_state = zero_model_states[0]
+    frozen_param_shapes = first_state.frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+
+    frozen_param_fragments = first_state.frozen_param_fragments
+
+    if debug:
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {wanted_numel}')
+
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {len(frozen_param_shapes)} params')
+
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        numel = shape.numel()
+        total_numel += numel
+
+        # the fragment stored under the param name is used as the param itself
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {numel} ")
+
+    total_params = len(frozen_param_shapes)
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable params from the per-rank flat fp32 partitions and store them in ``state_dict``.
+
+    Args:
+        - ``state_dict``: dict updated in place with one entry per param name
+        - ``world_size``: number of ranks; used for the debug dump and for the alignment of the sanity check
+        - ``fp32_flat_groups``: indexed as ``fp32_flat_groups[rank][group]``, the flat partition a rank holds for a param group
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first one is read
+
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed numel of a group differs from the available numel
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. for each param group, concatenate the partitions of all ranks into one flat vector
+    # 2. walk the group's ``name -> shape`` mapping in order and carve consecutive slices of
+    #    ``numel(shape)`` elements out of that vector, viewing each slice with the param's shape
+    # 3. round both the consumed offset and the vector length up to a multiple of ``2 * world_size``
+    #    and require them to match
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group i as held by every rank, in the order of ``fp32_flat_groups``
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset counts the elements of the current group's merged vector consumed so far
+        offset = 0
+        # from here on avail_numel is the length of the current group's vector, not the grand total
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape either offers numel() or is treated as a plain sequence of dims
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() + view() give a view into the merged vector, no extra copy is made
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict for a checkpoint of zero stage ``<= 2``.
+
+    Entries are added in this order: buffers, frozen params (unless excluded), trainable params and
+    finally the aliases of shared params.
+
+    Args:
+        - ``world_size``: number of ranks
+        - ``fp32_flat_groups``: per-rank flat fp32 partitions
+        - ``zero_model_states``: parsed model states; buffers and shared params come from the first one
+        - ``exclude_frozen_parameters``: when true the frozen params are left out
+
+    Returns:
+        - ``OrderedDict`` mapping names to tensors
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    state_dict.update(rank0_state.buffers)
+    if debug:
+        print(f"added {len(rank0_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: pair[0] becomes an alias of pair[1] when the latter was reconstructed
+    for pair in rank0_state.shared_params:
+        source_name = pair[1]
+        if source_name not in state_dict:
+            continue
+        state_dict[pair[0]] = state_dict[source_name]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Split a param of ``unpartitioned_numel`` elements evenly across ``world_size`` ranks.
+
+    Returns:
+        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the number
+          of elements needed to pad the param to a multiple of ``world_size``
+    """
+    full_chunks, leftover = divmod(unpartitioned_numel, world_size)
+    if leftover:
+        # a partial chunk needs one more element per rank and padding up to the next multiple
+        return full_chunks + 1, world_size - leftover
+    return full_chunks, 0
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen params from the fragments of all model states and store them in ``state_dict``.
+
+    Nothing happens when the first model state has no frozen param shapes.
+
+    Args:
+        - ``state_dict``: dict updated in place with one entry per frozen param name
+        - ``world_size``: number of ranks
+        - ``zero_model_states``: parsed model states, each providing ``frozen_param_fragments``
+    """
+    rank0_shapes = zero_model_states[0].frozen_param_shapes
+    if rank0_shapes is None or len(rank0_shapes) == 0:
+        return
+
+    if debug:
+        for rank in range(world_size):
+            rank_fragments = zero_model_states[rank].frozen_param_fragments
+            num_elem = sum(s.numel() for s in rank_fragments.values())
+            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(rank0_shapes)
+        wanted_numel = sum(s.numel() for s in rank0_shapes.values())
+        rank0_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values())
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in rank0_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # join the fragments of all model states, cut the result to the real size and restore the shape
+        fragments = [model_state.frozen_param_fragments[name] for model_state in zero_model_states]
+        joined = torch.cat(fragments, 0)
+        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    The partitions are only merged into a real torch tensor when ``contiguous()`` is called.
+
+    Args:
+        - ``flat_groups``: flat tensors indexed as ``flat_groups[rank][group_id]``
+        - ``flat_groups_offset``: cumulative element offsets of the groups of one rank; entry ``g`` is the
+          start of group ``g`` and entry ``g + 1`` its end
+        - ``offset``: start of this param inside the concatenation of one rank's groups
+        - ``partitioned_numel``: number of elements each rank holds for this param
+        - ``shape``: full shape of the param; it has to provide ``numel()``
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def _locate_groups(self, end_idx):
+        """
+        Return the ids of the first and the last group overlapped by ``[self.offset, end_idx)``.
+
+        Raises:
+            - ``IndexError``: if the range is not covered by the groups
+        """
+        start_group_id = None
+        end_group_id = None
+        for group_id in range(len(self.flat_groups_offset) - 1):
+            group_start = self.flat_groups_offset[group_id]
+            group_end = self.flat_groups_offset[group_id + 1]
+            if group_start <= self.offset < group_end:
+                start_group_id = group_id
+            if group_start < end_idx <= group_end:
+                end_group_id = group_id
+                break
+        if start_group_id is None or end_group_id is None:
+            raise IndexError(f"param range [{self.offset}, {end_idx}) is not covered by the flat groups "
+                             f"(group offsets: {list(self.flat_groups_offset)})")
+        return start_group_id, end_group_id
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns:
+            - a contiguous tensor of shape ``self.shape``; elements past ``self.shape.numel()`` (padding)
+              are dropped
+        """
+        full_numel = self.shape.numel()
+        if full_numel == 0:
+            # an empty param has no elements to gather
+            return torch.empty(self.shape, dtype=self.dtype)
+
+        end_idx = self.offset + self.partitioned_numel
+        # the group range depends only on the offsets, so it is looked up once instead of once per rank
+        start_group_id, end_group_id = self._locate_groups(end_idx)
+
+        pad_flat_param_chunks = []
+        for flat_groups_at_rank_i in self.flat_groups:
+            # for each rank, we need to collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                group_start = self.flat_groups_offset[group_id]
+                group_end = self.flat_groups_offset[group_id + 1]
+                # clamp both ends to the group: in every group after the first one the slice has to
+                # start at 0, a negative start would index from the end of the tensor
+                start_offset = max(self.offset, group_start) - group_start
+                end_offset = min(end_idx, group_end) - group_start
+                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        return pad_flat_param[:full_numel].view(self.shape).contiguous()
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register one lazy ``GatheredTensor`` per trainable param in ``state_dict`` (updated in place).
+
+    Args:
+        - ``state_dict``: dict receiving one ``GatheredTensor`` per param name
+        - ``world_size``: number of ranks
+        - ``fp32_flat_groups``: indexed as ``fp32_flat_groups[rank][group]``, the flat fp32 tensors of a rank
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first one is read
+
+    Raises:
+        - ``ValueError``: if the consumed numel differs from the numel available in the flat groups
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # the groups of rank 0 are taken as representative for the size held by every rank
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # each rank holds a list of flat groups, so the shapes are reported per rank and per group
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative start offsets of the groups of one rank, with a leading 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    # offset counted the elements of one rank; scale it to all ranks before comparing
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict for a checkpoint of zero stage 3.
+
+    Entries are added in this order: buffers, frozen params (unless excluded), trainable params and
+    finally the aliases of shared params.
+
+    Args:
+        - ``world_size``: number of ranks
+        - ``fp32_flat_groups``: per-rank flat fp32 tensors
+        - ``zero_model_states``: parsed model states; buffers and shared params come from the first one
+        - ``exclude_frozen_parameters``: when true the frozen params are left out
+
+    Returns:
+        - ``OrderedDict`` mapping names to the reconstructed entries
+    """
+    first_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    state_dict.update(first_state.buffers)
+    if debug:
+        print(f"added {len(first_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: pair[0] becomes an alias of pair[1] when the latter was reconstructed
+    for pair in first_state.shared_params:
+        origin = pair[1]
+        if origin not in state_dict:
+            continue
+        state_dict[pair[0]] = state_dict[origin]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Turn a state_dict of (pseudo) tensors into a plain dict of torch tensors.
+
+    Entries that are the very same object in ``state_dict`` also share one object in the result.
+
+    Args:
+        - ``state_dict``: mapping of names to objects offering ``shape``, ``dtype`` and ``contiguous()``
+        - ``return_empty_tensor``: when true, allocate uninitialized tensors of matching shape and dtype
+          instead of calling ``contiguous()``
+
+    Returns:
+        - dict with the same keys in the same order
+    """
+    materialized = {}
+    first_name_of = {}
+    for name, tensor in state_dict.items():
+        first_name = first_name_of.setdefault(id(tensor), name)
+        if first_name != name:  # shared tensors
+            materialized[name] = materialized[first_name]
+            continue
+        if return_empty_tensor:
+            materialized[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+        else:
+            materialized[name] = tensor.contiguous()
+    return materialized
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: if ``tag`` is not given and there is no ``latest`` file in ``checkpoint_dir``
+        - ``FileNotFoundError``: if the tag folder doesn't exist inside ``checkpoint_dir``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # weights are not merged yet
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # merge the partitions into a torch tensor
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        # fall back to the tag written into the 'latest' file of the checkpoint folder
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    return state_dict if lazy_mode else to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. With ``None`` all weights are written to one file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: if ``safetensors`` (for ``safe_serialization``) or ``huggingface_hub`` (for ``max_shard_size``) is needed but can't be imported
+    """
+
+    # Dependency pre-check
+    # the optional packages are imported up front so a missing one fails before any conversion work is done
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # lazy mode: the entries are only turned into torch tensors shard by shard in the save loop below
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # "{suffix}" is inserted in front of the file extension and handed to the shard planner as the pattern
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # the split is planned on empty tensors of matching shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding: a minimal stand-in exposing only the two fields read below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # only the tensors of the current shard are converted to real torch tensors
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    2. Put the provided model to cpu
+    3. Load the ``state_dict`` into the provided model (with ``strict=False``)
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    cpu_model.load_state_dict(fp32_state_dict, strict=False)
+
+    return cpu_model
+
+
+if __name__ == "__main__":
+    # command line entry point: parse the arguments and convert the checkpoint into fp32 weight files
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    # adjacent string literals are joined without a separator, so each line ends with an explicit space
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be of a size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    # module-level flag read by the reconstruction helpers to enable their verbose output
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/README.md b/dapo_milora_plus_20251201_131939/checkpoint-256/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-256/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/adapter_config.json
@@ -0,0 +1,40 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-256/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/latest b/dapo_milora_plus_20251201_131939/checkpoint-256/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/latest
@@ -0,0 +1 @@
+global_step256
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-256/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-256/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-256/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2304c1b8b835a380d86c49270097508c0388c771
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/trainer_state.json
@@ -0,0 +1,7970 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.23551057957681693,
+  "eval_steps": 500,
+  "global_step": 256,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 9.941184316630824e-06,
+      "clip_ratio/high_mean": 2.485296079157706e-06,
+      "clip_ratio/low_mean": 2.6134909091979353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8620205910101504e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 8426.1015625,
+      "completions/mean_terminated_length": 7965.72705078125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8188603445887566,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030983765609562397,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 162199765.0,
+      "reward": 0.25,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411106109619,
+      "sampling/importance_sampling_ratio/min": 0.0009119694004766643,
+      "sampling/sampling_logp_difference/max": 6.999904155731201,
+      "sampling/sampling_logp_difference/mean": 0.02070600539445877,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 2.612139087432297e-05,
+      "clip_ratio/high_mean": 6.530347718580742e-06,
+      "clip_ratio/low_mean": 3.7853451885894174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.438379949078808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15904.0,
+      "completions/mean_length": 7154.2109375,
+      "completions/mean_terminated_length": 6856.4755859375,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "entropy": 0.9913735538721085,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003430198412388563,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 163133232.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2120065689086914,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000275373458862,
+      "sampling/importance_sampling_ratio/min": 0.00042929715709760785,
+      "sampling/sampling_logp_difference/max": 7.753361225128174,
+      "sampling/sampling_logp_difference/mean": 0.02190260961651802,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 3.1841454983805306e-06,
+      "clip_ratio/high_mean": 7.960363745951327e-07,
+      "clip_ratio/low_mean": 3.384581600585079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4641852380445926e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 7693.1328125,
+      "completions/mean_terminated_length": 7412.7822265625,
+      "completions/min_length": 1077.0,
+      "completions/min_terminated_length": 1077.0,
+      "entropy": 0.9887127950787544,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002780586015433073,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 164134393.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999028444290161,
+      "sampling/importance_sampling_ratio/min": 3.559096626304381e-07,
+      "sampling/sampling_logp_difference/max": 14.848588943481445,
+      "sampling/sampling_logp_difference/mean": 0.021110571920871735,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 9.770586984814145e-06,
+      "clip_ratio/high_mean": 5.008155312680174e-06,
+      "clip_ratio/low_mean": 5.182203130971175e-05,
+      "clip_ratio/low_min": 1.5574546068819473e-05,
+      "clip_ratio/region_mean": 5.683018616764457e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 7072.1484375,
+      "completions/mean_terminated_length": 6771.76611328125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.861792616546154,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030156150460243225,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 165063412.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998926520347595,
+      "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06,
+      "sampling/sampling_logp_difference/max": 12.999247550964355,
+      "sampling/sampling_logp_difference/mean": 0.019325289875268936,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 2.2510209873871645e-05,
+      "clip_ratio/high_mean": 6.455301331698138e-06,
+      "clip_ratio/low_mean": 6.156819108582567e-05,
+      "clip_ratio/low_min": 5.763157332694391e-06,
+      "clip_ratio/region_mean": 6.802349253121065e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15062.0,
+      "completions/mean_length": 7353.421875,
+      "completions/mean_terminated_length": 7062.11279296875,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "entropy": 0.8961873054504395,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034921523183584213,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 166024306.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.0005124400486238301,
+      "sampling/sampling_logp_difference/max": 7.576326847076416,
+      "sampling/sampling_logp_difference/mean": 0.019593238830566406,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.3040991007073899e-05,
+      "clip_ratio/high_mean": 4.292725350296678e-06,
+      "clip_ratio/low_mean": 5.347559840629401e-05,
+      "clip_ratio/low_min": 6.613406640099129e-06,
+      "clip_ratio/region_mean": 5.776832381343411e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15604.0,
+      "completions/mean_length": 7348.03125,
+      "completions/mean_terminated_length": 6903.63916015625,
+      "completions/min_length": 1619.0,
+      "completions/min_terminated_length": 1619.0,
+      "entropy": 0.824029266834259,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027784397825598717,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 166984982.0,
+      "reward": 0.40625,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 0.0010020677000284195,
+      "sampling/sampling_logp_difference/max": 6.905689716339111,
+      "sampling/sampling_logp_difference/mean": 0.01857386901974678,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 3.330808067403268e-05,
+      "clip_ratio/high_mean": 1.0969530649163062e-05,
+      "clip_ratio/low_mean": 3.2080681648949394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3050211388617754e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16358.0,
+      "completions/mean_length": 7290.4765625,
+      "completions/mean_terminated_length": 6920.82080078125,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8884479627013206,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004110465291887522,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 167936971.0,
+      "reward": 0.4375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06,
+      "sampling/sampling_logp_difference/max": 13.219663619995117,
+      "sampling/sampling_logp_difference/mean": 0.019696572795510292,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 9.77357763076725e-06,
+      "clip_ratio/high_mean": 2.4433944076918124e-06,
+      "clip_ratio/low_mean": 3.466498992565903e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.710838473125477e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 7803.625,
+      "completions/mean_terminated_length": 6833.66943359375,
+      "completions/min_length": 929.0,
+      "completions/min_terminated_length": 929.0,
+      "entropy": 0.8326860442757607,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002410614863038063,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "num_tokens": 168955683.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0008801451185718179,
+      "sampling/sampling_logp_difference/max": 7.035423755645752,
+      "sampling/sampling_logp_difference/mean": 0.018545793369412422,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.4602125929741305e-05,
+      "clip_ratio/high_mean": 3.6505314824353263e-06,
+      "clip_ratio/low_mean": 3.4781527119776e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8432058772741584e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6804.34375,
+      "completions/mean_terminated_length": 6495.322265625,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.9669496119022369,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034376555122435093,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 169845823.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31534504890441895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 1.767780588579626e-08,
+      "sampling/sampling_logp_difference/max": 17.850955963134766,
+      "sampling/sampling_logp_difference/mean": 0.020515555515885353,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 1.5814722473805887e-05,
+      "clip_ratio/high_mean": 3.953680618451472e-06,
+      "clip_ratio/low_mean": 3.574208744794305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9695768407455034e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 6827.9609375,
+      "completions/mean_terminated_length": 6105.23583984375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 0.8833946585655212,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026675171684473753,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 170738210.0,
+      "reward": 0.421875,
+      "reward_std": 0.2698654532432556,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000019907951355,
+      "sampling/importance_sampling_ratio/min": 0.002906275913119316,
+      "sampling/sampling_logp_difference/max": 5.840882778167725,
+      "sampling/sampling_logp_difference/mean": 0.019948139786720276,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.6623121837255894e-05,
+      "clip_ratio/high_mean": 4.1557804593139736e-06,
+      "clip_ratio/low_mean": 6.462372630267055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.877950727357529e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15725.0,
+      "completions/mean_length": 7377.984375,
+      "completions/mean_terminated_length": 7307.07080078125,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8881714344024658,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0039620306342840195,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 171705152.0,
+      "reward": 0.3359375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05,
+      "sampling/sampling_logp_difference/max": 10.614632606506348,
+      "sampling/sampling_logp_difference/mean": 0.01964445412158966,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 9.639111340220552e-06,
+      "clip_ratio/high_mean": 2.409777835055138e-06,
+      "clip_ratio/low_mean": 2.775239624952519e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0162174198267167e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15265.0,
+      "completions/mean_length": 6051.8828125,
+      "completions/mean_terminated_length": 5543.74560546875,
+      "completions/min_length": 819.0,
+      "completions/min_terminated_length": 819.0,
+      "entropy": 0.8851477280259132,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0040458571165800095,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 172501881.0,
+      "reward": 0.4296875,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999410510063171,
+      "sampling/importance_sampling_ratio/min": 0.0021976607386022806,
+      "sampling/sampling_logp_difference/max": 6.120361804962158,
+      "sampling/sampling_logp_difference/mean": 0.01957303285598755,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 9.72708312474424e-06,
+      "clip_ratio/high_mean": 3.529455852913088e-06,
+      "clip_ratio/low_mean": 5.158422732165491e-05,
+      "clip_ratio/low_min": 1.1939961495954776e-05,
+      "clip_ratio/region_mean": 5.5113683174567996e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7830.171875,
+      "completions/mean_terminated_length": 7409.4912109375,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.9070459827780724,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005941574461758137,
+      "learning_rate": 1e-05,
+      "loss": 0.0427,
+      "num_tokens": 173522391.0,
+      "reward": 0.34375,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000017881393433,
+      "sampling/importance_sampling_ratio/min": 0.00011712420382536948,
+      "sampling/sampling_logp_difference/max": 9.052275657653809,
+      "sampling/sampling_logp_difference/mean": 0.021295130252838135,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 5.5543214330100454e-06,
+      "clip_ratio/high_mean": 1.3885803582525114e-06,
+      "clip_ratio/low_mean": 1.718775109793569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8576331683561875e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15443.0,
+      "completions/mean_length": 7520.6796875,
+      "completions/mean_terminated_length": 6769.55078125,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.8843575045466423,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025851845275610685,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 174504534.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 0.00039556476986035705,
+      "sampling/sampling_logp_difference/max": 7.835196018218994,
+      "sampling/sampling_logp_difference/mean": 0.02016005665063858,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 1.0145481155632297e-05,
+      "clip_ratio/high_mean": 2.536370288908074e-06,
+      "clip_ratio/low_mean": 3.617897255026037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.871534295285528e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 7382.1875,
+      "completions/mean_terminated_length": 6861.42138671875,
+      "completions/min_length": 934.0,
+      "completions/min_terminated_length": 934.0,
+      "entropy": 0.916313610970974,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004170550964772701,
+      "learning_rate": 1e-05,
+      "loss": 0.047,
+      "num_tokens": 175472574.0,
+      "reward": 0.46875,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05,
+      "sampling/sampling_logp_difference/max": 10.481352806091309,
+      "sampling/sampling_logp_difference/mean": 0.020749717950820923,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.83663013963087e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.83663013963087e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 6122.453125,
+      "completions/mean_terminated_length": 6041.6533203125,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.8984386026859283,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 176275568.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 7.88934721640544e-06,
+      "sampling/sampling_logp_difference/max": 11.74999713897705,
+      "sampling/sampling_logp_difference/mean": 0.020278753712773323,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 1.4535152331518475e-05,
+      "clip_ratio/high_mean": 3.6337880828796187e-06,
+      "clip_ratio/low_mean": 4.3961883989140915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7595671958333696e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15547.0,
+      "completions/mean_length": 4983.2890625,
+      "completions/mean_terminated_length": 4709.67236328125,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.825260303914547,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004848882555961609,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 176932549.0,
+      "reward": 0.6484375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616146087646,
+      "sampling/importance_sampling_ratio/min": 1.626804078114219e-05,
+      "sampling/sampling_logp_difference/max": 11.026308059692383,
+      "sampling/sampling_logp_difference/mean": 0.017959970980882645,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.1141860795760294e-05,
+      "clip_ratio/high_mean": 2.7854651989400736e-06,
+      "clip_ratio/low_mean": 4.2418692146384274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5204157913758536e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 5766.5234375,
+      "completions/mean_terminated_length": 5511.7041015625,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.9016259610652924,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004749474115669727,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 177691752.0,
+      "reward": 0.5,
+      "reward_std": 0.2738044261932373,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000141859054565,
+      "sampling/importance_sampling_ratio/min": 8.927558155846782e-06,
+      "sampling/sampling_logp_difference/max": 11.626367568969727,
+      "sampling/sampling_logp_difference/mean": 0.019118282943964005,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 5.5243735914700665e-06,
+      "clip_ratio/high_mean": 2.1587275114143267e-06,
+      "clip_ratio/low_mean": 4.609663824339805e-05,
+      "clip_ratio/low_min": 3.983555870945565e-06,
+      "clip_ratio/region_mean": 4.8255366664307076e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15696.0,
+      "completions/mean_length": 6993.671875,
+      "completions/mean_terminated_length": 6768.30419921875,
+      "completions/min_length": 889.0,
+      "completions/min_terminated_length": 889.0,
+      "entropy": 0.9074988812208176,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004418120253831148,
+      "learning_rate": 1e-05,
+      "loss": 0.1135,
+      "num_tokens": 178603454.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000037670135498,
+      "sampling/importance_sampling_ratio/min": 0.0018135923892259598,
+      "sampling/sampling_logp_difference/max": 6.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01957814022898674,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 5.126943051436683e-06,
+      "clip_ratio/high_mean": 1.2817357628591708e-06,
+      "clip_ratio/low_mean": 2.7488794444252562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.877053032079857e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 7445.1328125,
+      "completions/mean_terminated_length": 6849.20849609375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9255013465881348,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00237120408564806,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 179577063.0,
+      "reward": 0.40625,
+      "reward_std": 0.21040897071361542,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725818634033,
+      "sampling/importance_sampling_ratio/min": 9.651589061832055e-05,
+      "sampling/sampling_logp_difference/max": 9.245802879333496,
+      "sampling/sampling_logp_difference/mean": 0.02165937051177025,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.8956294752570102e-05,
+      "clip_ratio/high_mean": 4.7390736881425255e-06,
+      "clip_ratio/low_mean": 2.6486316301088664e-05,
+      "clip_ratio/low_min": 3.516273409331916e-06,
+      "clip_ratio/region_mean": 3.122539010291803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6120.5546875,
+      "completions/mean_terminated_length": 5703.34130859375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8181199952960014,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004715202376246452,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 180380422.0,
+      "reward": 0.5,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999874472618103,
+      "sampling/importance_sampling_ratio/min": 0.004350374918431044,
+      "sampling/sampling_logp_difference/max": 5.437493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018377620726823807,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 5.594843969447538e-06,
+      "clip_ratio/high_mean": 2.376495558564784e-06,
+      "clip_ratio/low_mean": 3.4097628713425365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6474124044616474e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 6351.203125,
+      "completions/mean_terminated_length": 5857.78662109375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.8798654451966286,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003063712501898408,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 181212776.0,
+      "reward": 0.453125,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999946355819702,
+      "sampling/importance_sampling_ratio/min": 7.891544555604924e-06,
+      "sampling/sampling_logp_difference/max": 11.74971866607666,
+      "sampling/sampling_logp_difference/mean": 0.019523698836565018,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.544438988001275e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.544438988001275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14180.0,
+      "completions/mean_length": 6330.046875,
+      "completions/mean_terminated_length": 6170.46044921875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.8319354206323624,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033194730058312416,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 182041910.0,
+      "reward": 0.453125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998994469642639,
+      "sampling/importance_sampling_ratio/min": 0.00010535263572819531,
+      "sampling/sampling_logp_difference/max": 9.158197402954102,
+      "sampling/sampling_logp_difference/mean": 0.018981872126460075,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7156292415165808e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7156292415165808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15982.0,
+      "completions/mean_length": 6665.2890625,
+      "completions/mean_terminated_length": 6351.7822265625,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9336326420307159,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004492956213653088,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 182914843.0,
+      "reward": 0.3828125,
+      "reward_std": 0.14807432889938354,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 0.011399568989872932,
+      "sampling/sampling_logp_difference/max": 4.474179744720459,
+      "sampling/sampling_logp_difference/mean": 0.02088768407702446,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.2495465802639956e-05,
+      "clip_ratio/high_mean": 9.084843100026774e-06,
+      "clip_ratio/low_mean": 5.4809036328151706e-05,
+      "clip_ratio/low_min": 8.953898031904828e-06,
+      "clip_ratio/region_mean": 6.389387954186532e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16064.0,
+      "completions/mean_length": 5393.9140625,
+      "completions/mean_terminated_length": 5039.39501953125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.7864786610007286,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003816079581156373,
+      "learning_rate": 1e-05,
+      "loss": -0.004,
+      "num_tokens": 183628152.0,
+      "reward": 0.546875,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998779892921448,
+      "sampling/importance_sampling_ratio/min": 0.003246711567044258,
+      "sampling/sampling_logp_difference/max": 5.730112552642822,
+      "sampling/sampling_logp_difference/mean": 0.018448319286108017,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 8.638648068881594e-06,
+      "clip_ratio/high_mean": 2.1596620172203984e-06,
+      "clip_ratio/low_mean": 1.6896704778446292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9056366909353528e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 7161.5,
+      "completions/mean_terminated_length": 7015.111328125,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.915394201874733,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003666195785626769,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 184562352.0,
+      "reward": 0.3671875,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00025550799909979105,
+      "sampling/sampling_logp_difference/max": 8.272256851196289,
+      "sampling/sampling_logp_difference/mean": 0.019755780696868896,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 6.424931598303374e-06,
+      "clip_ratio/high_mean": 1.6062328995758435e-06,
+      "clip_ratio/low_mean": 2.49038239417132e-05,
+      "clip_ratio/low_min": 4.00025601265952e-06,
+      "clip_ratio/region_mean": 2.651005689813246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15408.0,
+      "completions/mean_length": 7957.671875,
+      "completions/mean_terminated_length": 7685.8544921875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "entropy": 1.1176252663135529,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025940234772861004,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 185606670.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999893844127655,
+      "sampling/importance_sampling_ratio/min": 0.0007622809498570859,
+      "sampling/sampling_logp_difference/max": 7.179195404052734,
+      "sampling/sampling_logp_difference/mean": 0.02338646724820137,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 1.9903963220713194e-05,
+      "clip_ratio/high_mean": 5.829163114867697e-06,
+      "clip_ratio/low_mean": 4.4742550926457625e-05,
+      "clip_ratio/low_min": 3.5803282116830815e-06,
+      "clip_ratio/region_mean": 5.057171370026481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 7060.6640625,
+      "completions/mean_terminated_length": 6759.9111328125,
+      "completions/min_length": 1460.0,
+      "completions/min_terminated_length": 1460.0,
+      "entropy": 0.9148540124297142,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004315398633480072,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 186526883.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0004585353017318994,
+      "sampling/sampling_logp_difference/max": 7.687473297119141,
+      "sampling/sampling_logp_difference/mean": 0.01967843994498253,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 1.147099328591139e-05,
+      "clip_ratio/high_mean": 2.8677483214778476e-06,
+      "clip_ratio/low_mean": 2.8967988555450574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1835736763241584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15596.0,
+      "completions/mean_length": 6649.6640625,
+      "completions/mean_terminated_length": 6416.04052734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9298559054732323,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030786178540438414,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 187397536.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000005841255188,
+      "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07,
+      "sampling/sampling_logp_difference/max": 14.929608345031738,
+      "sampling/sampling_logp_difference/mean": 0.020215414464473724,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 2.2768570943298982e-05,
+      "clip_ratio/high_mean": 5.692142735824746e-06,
+      "clip_ratio/low_mean": 3.249637484259438e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8188517464732286e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 8292.015625,
+      "completions/mean_terminated_length": 7823.8837890625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.8232023045420647,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002438523108139634,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 188477778.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.005636279005557299,
+      "sampling/sampling_logp_difference/max": 5.178531169891357,
+      "sampling/sampling_logp_difference/mean": 0.018984414637088776,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 2.0840709566982696e-05,
+      "clip_ratio/high_mean": 6.135253556749376e-06,
+      "clip_ratio/low_mean": 2.255633432923787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.869158777230041e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15991.0,
+      "completions/mean_length": 7600.9765625,
+      "completions/mean_terminated_length": 6936.71484375,
+      "completions/min_length": 995.0,
+      "completions/min_terminated_length": 995.0,
+      "entropy": 0.8689917623996735,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004773247055709362,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 189470655.0,
+      "reward": 0.40625,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.001327168894931674,
+      "sampling/sampling_logp_difference/max": 6.624707221984863,
+      "sampling/sampling_logp_difference/mean": 0.018666012212634087,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 9.837458947004052e-06,
+      "clip_ratio/high_mean": 2.459364736751013e-06,
+      "clip_ratio/low_mean": 6.463955219260242e-05,
+      "clip_ratio/low_min": 1.0895145351241808e-05,
+      "clip_ratio/region_mean": 6.70989177251613e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16215.0,
+      "completions/mean_length": 7600.34375,
+      "completions/mean_terminated_length": 6855.96630859375,
+      "completions/min_length": 1335.0,
+      "completions/min_terminated_length": 1335.0,
+      "entropy": 0.7636929750442505,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004298723768442869,
+      "learning_rate": 1e-05,
+      "loss": 0.145,
+      "num_tokens": 190462227.0,
+      "reward": 0.515625,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999310374259949,
+      "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05,
+      "sampling/sampling_logp_difference/max": 9.996363639831543,
+      "sampling/sampling_logp_difference/mean": 0.018035393208265305,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 1.4060602325116633e-05,
+      "clip_ratio/high_mean": 3.5151505812791584e-06,
+      "clip_ratio/low_mean": 2.6516039497437305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.003119024924672e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15151.0,
+      "completions/mean_length": 6512.0,
+      "completions/mean_terminated_length": 6434.267578125,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.9043584689497948,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006741553544998169,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 191312483.0,
+      "reward": 0.484375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 1.778468504198827e-05,
+      "sampling/sampling_logp_difference/max": 10.937172889709473,
+      "sampling/sampling_logp_difference/mean": 0.020878732204437256,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 1.7356085209030425e-05,
+      "clip_ratio/high_mean": 4.339021302257606e-06,
+      "clip_ratio/low_mean": 2.8831826739406097e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.317084781429003e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7178.6875,
+      "completions/mean_terminated_length": 6565.00048828125,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.8899475410580635,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00281486171297729,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 192251235.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2240736484527588,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999714493751526,
+      "sampling/importance_sampling_ratio/min": 9.012543159769848e-05,
+      "sampling/sampling_logp_difference/max": 9.314308166503906,
+      "sampling/sampling_logp_difference/mean": 0.020196784287691116,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.5558084214717383e-05,
+      "clip_ratio/high_mean": 3.889521053679346e-06,
+      "clip_ratio/low_mean": 3.0248688972278615e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.413820991227112e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15501.0,
+      "completions/max_terminated_length": 15501.0,
+      "completions/mean_length": 6602.5625,
+      "completions/mean_terminated_length": 6602.5625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.9266818463802338,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005070593673735857,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193116763.0,
+      "reward": 0.53125,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999746680259705,
+      "sampling/importance_sampling_ratio/min": 2.726537559283315e-06,
+      "sampling/sampling_logp_difference/max": 12.812478065490723,
+      "sampling/sampling_logp_difference/mean": 0.020026464015245438,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.188727416476468e-06,
+      "clip_ratio/high_mean": 1.047181854119117e-06,
+      "clip_ratio/low_mean": 2.959152834591805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.063871008635033e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6818.8828125,
+      "completions/mean_terminated_length": 6430.056640625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.874519519507885,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006362155079841614,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 194007868.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009298324585,
+      "sampling/importance_sampling_ratio/min": 0.0005216691642999649,
+      "sampling/sampling_logp_difference/max": 7.55847692489624,
+      "sampling/sampling_logp_difference/mean": 0.01943325623869896,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 9.645911177358357e-06,
+      "clip_ratio/high_mean": 2.4114777943395893e-06,
+      "clip_ratio/low_mean": 6.821557258263056e-05,
+      "clip_ratio/low_min": 1.7265090718865395e-05,
+      "clip_ratio/region_mean": 7.062705049065698e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14536.0,
+      "completions/mean_length": 5515.625,
+      "completions/mean_terminated_length": 5343.111328125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0683523043990135,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003797185141593218,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "num_tokens": 194735980.0,
+      "reward": 0.421875,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 1.137102216830499e-07,
+      "sampling/sampling_logp_difference/max": 15.989612579345703,
+      "sampling/sampling_logp_difference/mean": 0.02120930328965187,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.1971412252241862e-05,
+      "clip_ratio/high_mean": 5.4928530630604655e-06,
+      "clip_ratio/low_mean": 4.9151800567415194e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4644653801005916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 5853.546875,
+      "completions/mean_terminated_length": 5770.6298828125,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.7975900694727898,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004124365746974945,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 195504882.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000672340393066,
+      "sampling/importance_sampling_ratio/min": 0.0032877910416573286,
+      "sampling/sampling_logp_difference/max": 5.717539310455322,
+      "sampling/sampling_logp_difference/mean": 0.017819223925471306,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 7.066538728395244e-06,
+      "clip_ratio/high_mean": 2.843255515472265e-06,
+      "clip_ratio/low_mean": 5.1467116236381116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.431037175185338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6686.25,
+      "completions/mean_terminated_length": 6532.31787109375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.9018580466508865,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024995009880512953,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 196379306.0,
+      "reward": 0.421875,
+      "reward_std": 0.35824593901634216,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999300837516785,
+      "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05,
+      "sampling/sampling_logp_difference/max": 10.818918228149414,
+      "sampling/sampling_logp_difference/mean": 0.018989525735378265,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 6.652828687947476e-06,
+      "clip_ratio/high_mean": 2.5722979444253724e-06,
+      "clip_ratio/low_mean": 3.699686294567073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95691608900961e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16347.0,
+      "completions/mean_length": 7487.3359375,
+      "completions/mean_terminated_length": 7200.3466796875,
+      "completions/min_length": 1222.0,
+      "completions/min_terminated_length": 1222.0,
+      "entropy": 0.9890001565217972,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004295211285352707,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 197357397.0,
+      "reward": 0.40625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0006548459641635418,
+      "sampling/sampling_logp_difference/max": 7.33111047744751,
+      "sampling/sampling_logp_difference/mean": 0.02209121733903885,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 6.0850939007650595e-06,
+      "clip_ratio/high_mean": 1.5212734751912649e-06,
+      "clip_ratio/low_mean": 2.9443070673096372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0964344205131056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7233.484375,
+      "completions/mean_terminated_length": 6938.30615234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.9683803990483284,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003119673579931259,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 198303795.0,
+      "reward": 0.328125,
+      "reward_std": 0.23014704883098602,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000243186950684,
+      "sampling/importance_sampling_ratio/min": 0.020358745008707047,
+      "sampling/sampling_logp_difference/max": 3.89424467086792,
+      "sampling/sampling_logp_difference/mean": 0.021085180342197418,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.963812095113099e-06,
+      "clip_ratio/high_mean": 1.9909530237782747e-06,
+      "clip_ratio/low_mean": 4.031422963635123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.23051826601295e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15733.0,
+      "completions/mean_length": 6457.78125,
+      "completions/mean_terminated_length": 6300.22265625,
+      "completions/min_length": 850.0,
+      "completions/min_terminated_length": 850.0,
+      "entropy": 0.8881053999066353,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033790848683565855,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 199154735.0,
+      "reward": 0.3828125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998799562454224,
+      "sampling/importance_sampling_ratio/min": 2.872048128210736e-07,
+      "sampling/sampling_logp_difference/max": 15.063070297241211,
+      "sampling/sampling_logp_difference/mean": 0.01950821653008461,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 9.059622016138746e-06,
+      "clip_ratio/high_mean": 3.3430123380639998e-06,
+      "clip_ratio/low_mean": 2.2856192117615137e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6199204512522556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 7904.40625,
+      "completions/mean_terminated_length": 7769.81005859375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9881557524204254,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021492803934961557,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 200185643.0,
+      "reward": 0.359375,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001094341278076,
+      "sampling/importance_sampling_ratio/min": 0.001458622980862856,
+      "sampling/sampling_logp_difference/max": 6.530262470245361,
+      "sampling/sampling_logp_difference/mean": 0.021201875060796738,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 6.9962839006620925e-06,
+      "clip_ratio/high_mean": 1.7490709751655231e-06,
+      "clip_ratio/low_mean": 3.018811844412994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193718976035598e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 7414.4921875,
+      "completions/mean_terminated_length": 7414.4921875,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "entropy": 0.9571134969592094,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037221095990389585,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 201153114.0,
+      "reward": 0.4375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999958872795105,
+      "sampling/importance_sampling_ratio/min": 0.0009130563121289015,
+      "sampling/sampling_logp_difference/max": 6.99871301651001,
+      "sampling/sampling_logp_difference/mean": 0.021356744691729546,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 1.1248092050664127e-05,
+      "clip_ratio/high_mean": 2.8120230126660317e-06,
+      "clip_ratio/low_mean": 5.4354991334548686e-05,
+      "clip_ratio/low_min": 6.868132004456129e-06,
+      "clip_ratio/region_mean": 5.716701480196207e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15835.0,
+      "completions/max_terminated_length": 15835.0,
+      "completions/mean_length": 5955.953125,
+      "completions/mean_terminated_length": 5955.953125,
+      "completions/min_length": 1394.0,
+      "completions/min_terminated_length": 1394.0,
+      "entropy": 0.730999618768692,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006285305600613356,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 201933044.0,
+      "reward": 0.59375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.007535050623118877,
+      "sampling/sampling_logp_difference/max": 4.888189792633057,
+      "sampling/sampling_logp_difference/mean": 0.016975615173578262,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 7.226686648209579e-06,
+      "clip_ratio/high_mean": 3.094216481258627e-06,
+      "clip_ratio/low_mean": 4.66828214484849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.977703792974353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6923.3515625,
+      "completions/mean_terminated_length": 6458.0732421875,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9938417226076126,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005667983554303646,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 202837281.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05,
+      "sampling/sampling_logp_difference/max": 10.402952194213867,
+      "sampling/sampling_logp_difference/mean": 0.022059854120016098,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 5.2318769121484365e-06,
+      "clip_ratio/high_mean": 1.3079692280371091e-06,
+      "clip_ratio/low_mean": 4.239228087499214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3700250216716086e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14726.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 5930.9296875,
+      "completions/mean_terminated_length": 5930.9296875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.8100385963916779,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004052883945405483,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 203614448.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999989926815033,
+      "sampling/importance_sampling_ratio/min": 0.00015170808183029294,
+      "sampling/sampling_logp_difference/max": 8.79355239868164,
+      "sampling/sampling_logp_difference/mean": 0.018519222736358643,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 4.905230980511988e-06,
+      "clip_ratio/high_mean": 1.226307745127997e-06,
+      "clip_ratio/low_mean": 5.500513248080097e-05,
+      "clip_ratio/low_min": 7.924934834591113e-06,
+      "clip_ratio/region_mean": 5.6231440112242126e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14996.0,
+      "completions/mean_length": 6911.1015625,
+      "completions/mean_terminated_length": 6108.3134765625,
+      "completions/min_length": 862.0,
+      "completions/min_terminated_length": 862.0,
+      "entropy": 0.9260227829217911,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004494607914239168,
+      "learning_rate": 1e-05,
+      "loss": 0.0269,
+      "num_tokens": 204518261.0,
+      "reward": 0.4140625,
+      "reward_std": 0.34033796191215515,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998886585235596,
+      "sampling/importance_sampling_ratio/min": 0.0015266009140759706,
+      "sampling/sampling_logp_difference/max": 6.484711647033691,
+      "sampling/sampling_logp_difference/mean": 0.020527629181742668,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 8.293764039990492e-06,
+      "clip_ratio/high_mean": 2.073441009997623e-06,
+      "clip_ratio/low_mean": 4.75325257411896e-05,
+      "clip_ratio/low_min": 3.599504680096288e-06,
+      "clip_ratio/region_mean": 4.960596663750039e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14637.0,
+      "completions/mean_length": 6972.921875,
+      "completions/mean_terminated_length": 6823.5400390625,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 1.0095533654093742,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029451537411659956,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 205433843.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05,
+      "sampling/sampling_logp_difference/max": 10.53177547454834,
+      "sampling/sampling_logp_difference/mean": 0.02013089321553707,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 4.163383164268453e-05,
+      "clip_ratio/high_mean": 1.382379150527413e-05,
+      "clip_ratio/low_mean": 3.86000854177837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2423876240936806e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 6706.6640625,
+      "completions/mean_terminated_length": 6313.2763671875,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 0.8647518903017044,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003371767932549119,
+      "learning_rate": 1e-05,
+      "loss": 0.073,
+      "num_tokens": 206310296.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 2.948181463580113e-05,
+      "sampling/sampling_logp_difference/max": 10.431736946105957,
+      "sampling/sampling_logp_difference/mean": 0.019770190119743347,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4946740381892596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4946740381892596e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 6882.609375,
+      "completions/mean_terminated_length": 6415.32763671875,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "entropy": 1.013342760503292,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016336971893906593,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 207210974.0,
+      "reward": 0.359375,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999210834503174,
+      "sampling/importance_sampling_ratio/min": 0.0013267879839986563,
+      "sampling/sampling_logp_difference/max": 6.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.02139991894364357,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.4866403944324702e-05,
+      "clip_ratio/high_mean": 3.7166009860811755e-06,
+      "clip_ratio/low_mean": 3.938925010515959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.310585177336179e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15203.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 6195.7421875,
+      "completions/mean_terminated_length": 6195.7421875,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.8448907434940338,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005036406684666872,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208021893.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955892562866,
+      "sampling/importance_sampling_ratio/min": 0.0040348549373447895,
+      "sampling/sampling_logp_difference/max": 5.512784957885742,
+      "sampling/sampling_logp_difference/mean": 0.018679853528738022,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.1244883353356272e-05,
+      "clip_ratio/high_mean": 2.811220838339068e-06,
+      "clip_ratio/low_mean": 3.422392001084518e-05,
+      "clip_ratio/low_min": 6.451612989621935e-06,
+      "clip_ratio/region_mean": 3.703514119024476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6829.609375,
+      "completions/mean_terminated_length": 6521.40283203125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.8679579794406891,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029643685556948185,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 208912059.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00038063788088038564,
+      "sampling/sampling_logp_difference/max": 7.873661994934082,
+      "sampling/sampling_logp_difference/mean": 0.018488366156816483,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 2.2700600311509334e-05,
+      "clip_ratio/high_mean": 5.675150077877333e-06,
+      "clip_ratio/low_mean": 3.138338854569156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.705853873725573e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14503.0,
+      "completions/max_terminated_length": 14503.0,
+      "completions/mean_length": 5444.4453125,
+      "completions/mean_terminated_length": 5444.4453125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0460086688399315,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035942886024713516,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "num_tokens": 209627804.0,
+      "reward": 0.484375,
+      "reward_std": 0.338498055934906,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997478723526,
+      "sampling/importance_sampling_ratio/min": 0.03179635480046272,
+      "sampling/sampling_logp_difference/max": 3.4484035968780518,
+      "sampling/sampling_logp_difference/mean": 0.020146891474723816,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.477029400120955e-05,
+      "clip_ratio/high_mean": 4.552578502625693e-06,
+      "clip_ratio/low_mean": 5.265122354103369e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.720380158891203e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16244.0,
+      "completions/mean_length": 7657.390625,
+      "completions/mean_terminated_length": 7152.544921875,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 0.9528728649020195,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0044983453117311,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 210630150.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000007152557373,
+      "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05,
+      "sampling/sampling_logp_difference/max": 10.158285140991211,
+      "sampling/sampling_logp_difference/mean": 0.02131088823080063,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 8.607642712377128e-06,
+      "clip_ratio/high_mean": 2.151910678094282e-06,
+      "clip_ratio/low_mean": 2.2759413695894182e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.491132454451872e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7574.3515625,
+      "completions/mean_terminated_length": 7504.984375,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 1.0009776800870895,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006095650140196085,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 211620355.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 0.0013946897815912962,
+      "sampling/sampling_logp_difference/max": 6.575083255767822,
+      "sampling/sampling_logp_difference/mean": 0.021727774292230606,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.764823082339717e-05,
+      "clip_ratio/high_mean": 5.141430960975413e-06,
+      "clip_ratio/low_mean": 5.936152001595474e-05,
+      "clip_ratio/low_min": 9.155588486464694e-06,
+      "clip_ratio/region_mean": 6.450295177273802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14915.0,
+      "completions/mean_length": 7919.6875,
+      "completions/mean_terminated_length": 7716.54443359375,
+      "completions/min_length": 1517.0,
+      "completions/min_terminated_length": 1517.0,
+      "entropy": 1.0405654236674309,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037038614973425865,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 212654747.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381899833679,
+      "sampling/importance_sampling_ratio/min": 0.0057550109922885895,
+      "sampling/sampling_logp_difference/max": 5.157684326171875,
+      "sampling/sampling_logp_difference/mean": 0.022051017731428146,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.265254240934155e-05,
+      "clip_ratio/high_mean": 3.1631356023353874e-06,
+      "clip_ratio/low_mean": 4.716233138424286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.032546687289141e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 8613.4765625,
+      "completions/mean_terminated_length": 7735.0693359375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.890489287674427,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325607368722558,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 213774584.0,
+      "reward": 0.40625,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000060796737671,
+      "sampling/importance_sampling_ratio/min": 1.670176425250247e-05,
+      "sampling/sampling_logp_difference/max": 10.999996185302734,
+      "sampling/sampling_logp_difference/mean": 0.020002499222755432,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.6404605503339553e-05,
+      "clip_ratio/high_mean": 4.101151375834888e-06,
+      "clip_ratio/low_mean": 3.880500707964529e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2906158682853857e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7324.8984375,
+      "completions/mean_terminated_length": 6473.1884765625,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 0.761004202067852,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038265211042016745,
+      "learning_rate": 1e-05,
+      "loss": 0.0717,
+      "num_tokens": 214728371.0,
+      "reward": 0.515625,
+      "reward_std": 0.32719239592552185,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000168085098267,
+      "sampling/importance_sampling_ratio/min": 0.0003049026126973331,
+      "sampling/sampling_logp_difference/max": 8.095518112182617,
+      "sampling/sampling_logp_difference/mean": 0.018367979675531387,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 5.624549885396846e-06,
+      "clip_ratio/high_mean": 1.4061374713492114e-06,
+      "clip_ratio/low_mean": 3.6433707123251224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7839844594600436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14167.0,
+      "completions/max_terminated_length": 14167.0,
+      "completions/mean_length": 6422.0859375,
+      "completions/mean_terminated_length": 6422.0859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.9946094751358032,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002729539293795824,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 215570806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.026308411732316017,
+      "sampling/sampling_logp_difference/max": 3.637866497039795,
+      "sampling/sampling_logp_difference/mean": 0.021903935819864273,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 7.2379848461423535e-06,
+      "clip_ratio/high_mean": 1.8094962115355884e-06,
+      "clip_ratio/low_mean": 3.17277934982485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353728982347093e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15585.0,
+      "completions/mean_length": 6845.2890625,
+      "completions/mean_terminated_length": 6693.88134765625,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8822609707713127,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004974282346665859,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 216465635.0,
+      "reward": 0.5390625,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 8.749838889343664e-05,
+      "sampling/sampling_logp_difference/max": 9.343890190124512,
+      "sampling/sampling_logp_difference/mean": 0.019389234483242035,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.58592818024772e-05,
+      "clip_ratio/high_mean": 3.9648204506193e-06,
+      "clip_ratio/low_mean": 4.096964960353944e-05,
+      "clip_ratio/low_min": 1.7403560605089297e-05,
+      "clip_ratio/region_mean": 4.49344687467601e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 7805.484375,
+      "completions/mean_terminated_length": 7528.7578125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.9977599084377289,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0033159854356199503,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 217485089.0,
+      "reward": 0.421875,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999412298202515,
+      "sampling/importance_sampling_ratio/min": 7.967943383846432e-05,
+      "sampling/sampling_logp_difference/max": 9.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.021925684064626694,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.8265397557115648e-05,
+      "clip_ratio/high_mean": 4.566349389278912e-06,
+      "clip_ratio/low_mean": 4.044636898470344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5012717691861326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15681.0,
+      "completions/mean_length": 7737.5546875,
+      "completions/mean_terminated_length": 7530.04052734375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.8667014688253403,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034952745772898197,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "num_tokens": 218496040.0,
+      "reward": 0.453125,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999128580093384,
+      "sampling/importance_sampling_ratio/min": 6.726370338583365e-05,
+      "sampling/sampling_logp_difference/max": 9.606889724731445,
+      "sampling/sampling_logp_difference/mean": 0.019742710515856743,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 8.244294804171659e-06,
+      "clip_ratio/high_mean": 2.0610737010429148e-06,
+      "clip_ratio/low_mean": 3.204250072030845e-05,
+      "clip_ratio/low_min": 3.323495775475749e-06,
+      "clip_ratio/region_mean": 3.410357436450795e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15858.0,
+      "completions/mean_length": 7365.84375,
+      "completions/mean_terminated_length": 6601.59326171875,
+      "completions/min_length": 744.0,
+      "completions/min_terminated_length": 744.0,
+      "entropy": 0.8151945173740387,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038676802068948746,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 219459140.0,
+      "reward": 0.46875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00023387260443996638,
+      "sampling/sampling_logp_difference/max": 8.360733985900879,
+      "sampling/sampling_logp_difference/mean": 0.018882082775235176,
+      "step": 256
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 219459140,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-256/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-256/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Set to a truthy value to make the functions below print verbose diagnostics
+# (shapes, element counts, offsets) while the checkpoint is parsed and merged.
+debug = 0
+
+# load to cpu: passed as ``map_location`` to ``torch.load`` when checkpoint files are read
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.
+
+    Args:
+        - ``state_dict``: mapping that receives one entry per parameter name
+        - ``world_size``: number of partitions; here it only drives the debug output and the alignment check
+        - ``fp32_flat_groups``: indexed as ``[rank][param_group]``, each entry a flat fp32 tensor
+        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is used
+
+    Raises:
+        ValueError: if, after alignment, the consumed element count of a group differs from
+        the number of elements available in that group
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. for every param group, concatenate the partitions of all ranks (in rank order) into
+    #    a single flat vector
+    # 2. walk that group's shapes in order and carve each parameter out of the flat vector
+    #    at a running offset
+    # 3. check that the consumed and available element counts agree once both are aligned
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # the i-th group's partition from every rank, joined along dim 0
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset and avail_numel are tracked per group from here on
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes that expose numel() use it; anything else is treated as an iterable of dims
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() and view() return views, so the entry shares storage with the merged vector
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers: copied as-is from the first entry of ``zero_model_states``
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: make pair[0] point at pair[1]'s entry when pair[1] was reconstructed
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert a state_dict of GatheredTensor to torch tensors via ``.contiguous()``; if ``return_empty_tensor`` is set, return same shape/dtype ``torch.empty`` placeholders instead
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors: same object already converted, reuse that result
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``. Raises ``ValueError`` if that file is missing.
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) in ``output_dir`` that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single unsharded file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty placeholder tensors
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard (each shard is converted to torch tensors only when it is written)
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model (using ``strict=False``)
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/README.md b/dapo_milora_plus_20251201_131939/checkpoint-320/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-320/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/adapter_config.json
@@ -0,0 +1,40 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-320/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/latest b/dapo_milora_plus_20251201_131939/checkpoint-320/latest
new file mode 100644
index 0000000000000000000000000000000000000000..9d535587efdab3121736d8095481e4143f000213
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/latest
@@ -0,0 +1 @@
+global_step320
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-320/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-320/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-320/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ebf99131405e095aadde6f9bf4b506f4e32b67d3
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/trainer_state.json
@@ -0,0 +1,9954 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.29438822447102114,
+  "eval_steps": 500,
+  "global_step": 320,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 9.941184316630824e-06,
+      "clip_ratio/high_mean": 2.485296079157706e-06,
+      "clip_ratio/low_mean": 2.6134909091979353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8620205910101504e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 8426.1015625,
+      "completions/mean_terminated_length": 7965.72705078125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8188603445887566,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030983765609562397,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 162199765.0,
+      "reward": 0.25,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411106109619,
+      "sampling/importance_sampling_ratio/min": 0.0009119694004766643,
+      "sampling/sampling_logp_difference/max": 6.999904155731201,
+      "sampling/sampling_logp_difference/mean": 0.02070600539445877,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 2.612139087432297e-05,
+      "clip_ratio/high_mean": 6.530347718580742e-06,
+      "clip_ratio/low_mean": 3.7853451885894174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.438379949078808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15904.0,
+      "completions/mean_length": 7154.2109375,
+      "completions/mean_terminated_length": 6856.4755859375,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "entropy": 0.9913735538721085,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003430198412388563,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 163133232.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2120065689086914,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000275373458862,
+      "sampling/importance_sampling_ratio/min": 0.00042929715709760785,
+      "sampling/sampling_logp_difference/max": 7.753361225128174,
+      "sampling/sampling_logp_difference/mean": 0.02190260961651802,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 3.1841454983805306e-06,
+      "clip_ratio/high_mean": 7.960363745951327e-07,
+      "clip_ratio/low_mean": 3.384581600585079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4641852380445926e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 7693.1328125,
+      "completions/mean_terminated_length": 7412.7822265625,
+      "completions/min_length": 1077.0,
+      "completions/min_terminated_length": 1077.0,
+      "entropy": 0.9887127950787544,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002780586015433073,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 164134393.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999028444290161,
+      "sampling/importance_sampling_ratio/min": 3.559096626304381e-07,
+      "sampling/sampling_logp_difference/max": 14.848588943481445,
+      "sampling/sampling_logp_difference/mean": 0.021110571920871735,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 9.770586984814145e-06,
+      "clip_ratio/high_mean": 5.008155312680174e-06,
+      "clip_ratio/low_mean": 5.182203130971175e-05,
+      "clip_ratio/low_min": 1.5574546068819473e-05,
+      "clip_ratio/region_mean": 5.683018616764457e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 7072.1484375,
+      "completions/mean_terminated_length": 6771.76611328125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.861792616546154,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030156150460243225,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 165063412.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998926520347595,
+      "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06,
+      "sampling/sampling_logp_difference/max": 12.999247550964355,
+      "sampling/sampling_logp_difference/mean": 0.019325289875268936,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 2.2510209873871645e-05,
+      "clip_ratio/high_mean": 6.455301331698138e-06,
+      "clip_ratio/low_mean": 6.156819108582567e-05,
+      "clip_ratio/low_min": 5.763157332694391e-06,
+      "clip_ratio/region_mean": 6.802349253121065e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15062.0,
+      "completions/mean_length": 7353.421875,
+      "completions/mean_terminated_length": 7062.11279296875,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "entropy": 0.8961873054504395,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034921523183584213,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 166024306.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.0005124400486238301,
+      "sampling/sampling_logp_difference/max": 7.576326847076416,
+      "sampling/sampling_logp_difference/mean": 0.019593238830566406,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.3040991007073899e-05,
+      "clip_ratio/high_mean": 4.292725350296678e-06,
+      "clip_ratio/low_mean": 5.347559840629401e-05,
+      "clip_ratio/low_min": 6.613406640099129e-06,
+      "clip_ratio/region_mean": 5.776832381343411e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15604.0,
+      "completions/mean_length": 7348.03125,
+      "completions/mean_terminated_length": 6903.63916015625,
+      "completions/min_length": 1619.0,
+      "completions/min_terminated_length": 1619.0,
+      "entropy": 0.824029266834259,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027784397825598717,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 166984982.0,
+      "reward": 0.40625,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 0.0010020677000284195,
+      "sampling/sampling_logp_difference/max": 6.905689716339111,
+      "sampling/sampling_logp_difference/mean": 0.01857386901974678,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 3.330808067403268e-05,
+      "clip_ratio/high_mean": 1.0969530649163062e-05,
+      "clip_ratio/low_mean": 3.2080681648949394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3050211388617754e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16358.0,
+      "completions/mean_length": 7290.4765625,
+      "completions/mean_terminated_length": 6920.82080078125,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8884479627013206,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004110465291887522,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 167936971.0,
+      "reward": 0.4375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06,
+      "sampling/sampling_logp_difference/max": 13.219663619995117,
+      "sampling/sampling_logp_difference/mean": 0.019696572795510292,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 9.77357763076725e-06,
+      "clip_ratio/high_mean": 2.4433944076918124e-06,
+      "clip_ratio/low_mean": 3.466498992565903e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.710838473125477e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 7803.625,
+      "completions/mean_terminated_length": 6833.66943359375,
+      "completions/min_length": 929.0,
+      "completions/min_terminated_length": 929.0,
+      "entropy": 0.8326860442757607,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002410614863038063,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "num_tokens": 168955683.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0008801451185718179,
+      "sampling/sampling_logp_difference/max": 7.035423755645752,
+      "sampling/sampling_logp_difference/mean": 0.018545793369412422,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.4602125929741305e-05,
+      "clip_ratio/high_mean": 3.6505314824353263e-06,
+      "clip_ratio/low_mean": 3.4781527119776e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8432058772741584e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6804.34375,
+      "completions/mean_terminated_length": 6495.322265625,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.9669496119022369,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034376555122435093,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 169845823.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31534504890441895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 1.767780588579626e-08,
+      "sampling/sampling_logp_difference/max": 17.850955963134766,
+      "sampling/sampling_logp_difference/mean": 0.020515555515885353,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 1.5814722473805887e-05,
+      "clip_ratio/high_mean": 3.953680618451472e-06,
+      "clip_ratio/low_mean": 3.574208744794305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9695768407455034e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 6827.9609375,
+      "completions/mean_terminated_length": 6105.23583984375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 0.8833946585655212,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026675171684473753,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 170738210.0,
+      "reward": 0.421875,
+      "reward_std": 0.2698654532432556,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000019907951355,
+      "sampling/importance_sampling_ratio/min": 0.002906275913119316,
+      "sampling/sampling_logp_difference/max": 5.840882778167725,
+      "sampling/sampling_logp_difference/mean": 0.019948139786720276,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.6623121837255894e-05,
+      "clip_ratio/high_mean": 4.1557804593139736e-06,
+      "clip_ratio/low_mean": 6.462372630267055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.877950727357529e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15725.0,
+      "completions/mean_length": 7377.984375,
+      "completions/mean_terminated_length": 7307.07080078125,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8881714344024658,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0039620306342840195,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 171705152.0,
+      "reward": 0.3359375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05,
+      "sampling/sampling_logp_difference/max": 10.614632606506348,
+      "sampling/sampling_logp_difference/mean": 0.01964445412158966,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 9.639111340220552e-06,
+      "clip_ratio/high_mean": 2.409777835055138e-06,
+      "clip_ratio/low_mean": 2.775239624952519e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0162174198267167e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15265.0,
+      "completions/mean_length": 6051.8828125,
+      "completions/mean_terminated_length": 5543.74560546875,
+      "completions/min_length": 819.0,
+      "completions/min_terminated_length": 819.0,
+      "entropy": 0.8851477280259132,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0040458571165800095,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 172501881.0,
+      "reward": 0.4296875,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999410510063171,
+      "sampling/importance_sampling_ratio/min": 0.0021976607386022806,
+      "sampling/sampling_logp_difference/max": 6.120361804962158,
+      "sampling/sampling_logp_difference/mean": 0.01957303285598755,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 9.72708312474424e-06,
+      "clip_ratio/high_mean": 3.529455852913088e-06,
+      "clip_ratio/low_mean": 5.158422732165491e-05,
+      "clip_ratio/low_min": 1.1939961495954776e-05,
+      "clip_ratio/region_mean": 5.5113683174567996e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7830.171875,
+      "completions/mean_terminated_length": 7409.4912109375,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.9070459827780724,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005941574461758137,
+      "learning_rate": 1e-05,
+      "loss": 0.0427,
+      "num_tokens": 173522391.0,
+      "reward": 0.34375,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000017881393433,
+      "sampling/importance_sampling_ratio/min": 0.00011712420382536948,
+      "sampling/sampling_logp_difference/max": 9.052275657653809,
+      "sampling/sampling_logp_difference/mean": 0.021295130252838135,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 5.5543214330100454e-06,
+      "clip_ratio/high_mean": 1.3885803582525114e-06,
+      "clip_ratio/low_mean": 1.718775109793569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8576331683561875e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15443.0,
+      "completions/mean_length": 7520.6796875,
+      "completions/mean_terminated_length": 6769.55078125,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.8843575045466423,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025851845275610685,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 174504534.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 0.00039556476986035705,
+      "sampling/sampling_logp_difference/max": 7.835196018218994,
+      "sampling/sampling_logp_difference/mean": 0.02016005665063858,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 1.0145481155632297e-05,
+      "clip_ratio/high_mean": 2.536370288908074e-06,
+      "clip_ratio/low_mean": 3.617897255026037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.871534295285528e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 7382.1875,
+      "completions/mean_terminated_length": 6861.42138671875,
+      "completions/min_length": 934.0,
+      "completions/min_terminated_length": 934.0,
+      "entropy": 0.916313610970974,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004170550964772701,
+      "learning_rate": 1e-05,
+      "loss": 0.047,
+      "num_tokens": 175472574.0,
+      "reward": 0.46875,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05,
+      "sampling/sampling_logp_difference/max": 10.481352806091309,
+      "sampling/sampling_logp_difference/mean": 0.020749717950820923,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.83663013963087e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.83663013963087e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 6122.453125,
+      "completions/mean_terminated_length": 6041.6533203125,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.8984386026859283,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 176275568.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 7.88934721640544e-06,
+      "sampling/sampling_logp_difference/max": 11.74999713897705,
+      "sampling/sampling_logp_difference/mean": 0.020278753712773323,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 1.4535152331518475e-05,
+      "clip_ratio/high_mean": 3.6337880828796187e-06,
+      "clip_ratio/low_mean": 4.3961883989140915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7595671958333696e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15547.0,
+      "completions/mean_length": 4983.2890625,
+      "completions/mean_terminated_length": 4709.67236328125,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.825260303914547,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004848882555961609,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 176932549.0,
+      "reward": 0.6484375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616146087646,
+      "sampling/importance_sampling_ratio/min": 1.626804078114219e-05,
+      "sampling/sampling_logp_difference/max": 11.026308059692383,
+      "sampling/sampling_logp_difference/mean": 0.017959970980882645,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.1141860795760294e-05,
+      "clip_ratio/high_mean": 2.7854651989400736e-06,
+      "clip_ratio/low_mean": 4.2418692146384274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5204157913758536e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 5766.5234375,
+      "completions/mean_terminated_length": 5511.7041015625,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.9016259610652924,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004749474115669727,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 177691752.0,
+      "reward": 0.5,
+      "reward_std": 0.2738044261932373,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000141859054565,
+      "sampling/importance_sampling_ratio/min": 8.927558155846782e-06,
+      "sampling/sampling_logp_difference/max": 11.626367568969727,
+      "sampling/sampling_logp_difference/mean": 0.019118282943964005,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 5.5243735914700665e-06,
+      "clip_ratio/high_mean": 2.1587275114143267e-06,
+      "clip_ratio/low_mean": 4.609663824339805e-05,
+      "clip_ratio/low_min": 3.983555870945565e-06,
+      "clip_ratio/region_mean": 4.8255366664307076e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15696.0,
+      "completions/mean_length": 6993.671875,
+      "completions/mean_terminated_length": 6768.30419921875,
+      "completions/min_length": 889.0,
+      "completions/min_terminated_length": 889.0,
+      "entropy": 0.9074988812208176,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004418120253831148,
+      "learning_rate": 1e-05,
+      "loss": 0.1135,
+      "num_tokens": 178603454.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000037670135498,
+      "sampling/importance_sampling_ratio/min": 0.0018135923892259598,
+      "sampling/sampling_logp_difference/max": 6.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01957814022898674,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 5.126943051436683e-06,
+      "clip_ratio/high_mean": 1.2817357628591708e-06,
+      "clip_ratio/low_mean": 2.7488794444252562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.877053032079857e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 7445.1328125,
+      "completions/mean_terminated_length": 6849.20849609375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9255013465881348,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00237120408564806,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 179577063.0,
+      "reward": 0.40625,
+      "reward_std": 0.21040897071361542,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725818634033,
+      "sampling/importance_sampling_ratio/min": 9.651589061832055e-05,
+      "sampling/sampling_logp_difference/max": 9.245802879333496,
+      "sampling/sampling_logp_difference/mean": 0.02165937051177025,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.8956294752570102e-05,
+      "clip_ratio/high_mean": 4.7390736881425255e-06,
+      "clip_ratio/low_mean": 2.6486316301088664e-05,
+      "clip_ratio/low_min": 3.516273409331916e-06,
+      "clip_ratio/region_mean": 3.122539010291803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6120.5546875,
+      "completions/mean_terminated_length": 5703.34130859375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8181199952960014,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004715202376246452,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 180380422.0,
+      "reward": 0.5,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999874472618103,
+      "sampling/importance_sampling_ratio/min": 0.004350374918431044,
+      "sampling/sampling_logp_difference/max": 5.437493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018377620726823807,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 5.594843969447538e-06,
+      "clip_ratio/high_mean": 2.376495558564784e-06,
+      "clip_ratio/low_mean": 3.4097628713425365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6474124044616474e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 6351.203125,
+      "completions/mean_terminated_length": 5857.78662109375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.8798654451966286,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003063712501898408,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 181212776.0,
+      "reward": 0.453125,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999946355819702,
+      "sampling/importance_sampling_ratio/min": 7.891544555604924e-06,
+      "sampling/sampling_logp_difference/max": 11.74971866607666,
+      "sampling/sampling_logp_difference/mean": 0.019523698836565018,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.544438988001275e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.544438988001275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14180.0,
+      "completions/mean_length": 6330.046875,
+      "completions/mean_terminated_length": 6170.46044921875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.8319354206323624,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033194730058312416,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 182041910.0,
+      "reward": 0.453125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998994469642639,
+      "sampling/importance_sampling_ratio/min": 0.00010535263572819531,
+      "sampling/sampling_logp_difference/max": 9.158197402954102,
+      "sampling/sampling_logp_difference/mean": 0.018981872126460075,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7156292415165808e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7156292415165808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15982.0,
+      "completions/mean_length": 6665.2890625,
+      "completions/mean_terminated_length": 6351.7822265625,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9336326420307159,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004492956213653088,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 182914843.0,
+      "reward": 0.3828125,
+      "reward_std": 0.14807432889938354,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 0.011399568989872932,
+      "sampling/sampling_logp_difference/max": 4.474179744720459,
+      "sampling/sampling_logp_difference/mean": 0.02088768407702446,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.2495465802639956e-05,
+      "clip_ratio/high_mean": 9.084843100026774e-06,
+      "clip_ratio/low_mean": 5.4809036328151706e-05,
+      "clip_ratio/low_min": 8.953898031904828e-06,
+      "clip_ratio/region_mean": 6.389387954186532e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16064.0,
+      "completions/mean_length": 5393.9140625,
+      "completions/mean_terminated_length": 5039.39501953125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.7864786610007286,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003816079581156373,
+      "learning_rate": 1e-05,
+      "loss": -0.004,
+      "num_tokens": 183628152.0,
+      "reward": 0.546875,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998779892921448,
+      "sampling/importance_sampling_ratio/min": 0.003246711567044258,
+      "sampling/sampling_logp_difference/max": 5.730112552642822,
+      "sampling/sampling_logp_difference/mean": 0.018448319286108017,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 8.638648068881594e-06,
+      "clip_ratio/high_mean": 2.1596620172203984e-06,
+      "clip_ratio/low_mean": 1.6896704778446292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9056366909353528e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 7161.5,
+      "completions/mean_terminated_length": 7015.111328125,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.915394201874733,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003666195785626769,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 184562352.0,
+      "reward": 0.3671875,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00025550799909979105,
+      "sampling/sampling_logp_difference/max": 8.272256851196289,
+      "sampling/sampling_logp_difference/mean": 0.019755780696868896,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 6.424931598303374e-06,
+      "clip_ratio/high_mean": 1.6062328995758435e-06,
+      "clip_ratio/low_mean": 2.49038239417132e-05,
+      "clip_ratio/low_min": 4.00025601265952e-06,
+      "clip_ratio/region_mean": 2.651005689813246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15408.0,
+      "completions/mean_length": 7957.671875,
+      "completions/mean_terminated_length": 7685.8544921875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "entropy": 1.1176252663135529,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025940234772861004,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 185606670.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999893844127655,
+      "sampling/importance_sampling_ratio/min": 0.0007622809498570859,
+      "sampling/sampling_logp_difference/max": 7.179195404052734,
+      "sampling/sampling_logp_difference/mean": 0.02338646724820137,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 1.9903963220713194e-05,
+      "clip_ratio/high_mean": 5.829163114867697e-06,
+      "clip_ratio/low_mean": 4.4742550926457625e-05,
+      "clip_ratio/low_min": 3.5803282116830815e-06,
+      "clip_ratio/region_mean": 5.057171370026481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 7060.6640625,
+      "completions/mean_terminated_length": 6759.9111328125,
+      "completions/min_length": 1460.0,
+      "completions/min_terminated_length": 1460.0,
+      "entropy": 0.9148540124297142,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004315398633480072,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 186526883.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0004585353017318994,
+      "sampling/sampling_logp_difference/max": 7.687473297119141,
+      "sampling/sampling_logp_difference/mean": 0.01967843994498253,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 1.147099328591139e-05,
+      "clip_ratio/high_mean": 2.8677483214778476e-06,
+      "clip_ratio/low_mean": 2.8967988555450574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1835736763241584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15596.0,
+      "completions/mean_length": 6649.6640625,
+      "completions/mean_terminated_length": 6416.04052734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9298559054732323,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030786178540438414,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 187397536.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000005841255188,
+      "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07,
+      "sampling/sampling_logp_difference/max": 14.929608345031738,
+      "sampling/sampling_logp_difference/mean": 0.020215414464473724,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 2.2768570943298982e-05,
+      "clip_ratio/high_mean": 5.692142735824746e-06,
+      "clip_ratio/low_mean": 3.249637484259438e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8188517464732286e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 8292.015625,
+      "completions/mean_terminated_length": 7823.8837890625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.8232023045420647,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002438523108139634,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 188477778.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.005636279005557299,
+      "sampling/sampling_logp_difference/max": 5.178531169891357,
+      "sampling/sampling_logp_difference/mean": 0.018984414637088776,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 2.0840709566982696e-05,
+      "clip_ratio/high_mean": 6.135253556749376e-06,
+      "clip_ratio/low_mean": 2.255633432923787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.869158777230041e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15991.0,
+      "completions/mean_length": 7600.9765625,
+      "completions/mean_terminated_length": 6936.71484375,
+      "completions/min_length": 995.0,
+      "completions/min_terminated_length": 995.0,
+      "entropy": 0.8689917623996735,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004773247055709362,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 189470655.0,
+      "reward": 0.40625,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.001327168894931674,
+      "sampling/sampling_logp_difference/max": 6.624707221984863,
+      "sampling/sampling_logp_difference/mean": 0.018666012212634087,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 9.837458947004052e-06,
+      "clip_ratio/high_mean": 2.459364736751013e-06,
+      "clip_ratio/low_mean": 6.463955219260242e-05,
+      "clip_ratio/low_min": 1.0895145351241808e-05,
+      "clip_ratio/region_mean": 6.70989177251613e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16215.0,
+      "completions/mean_length": 7600.34375,
+      "completions/mean_terminated_length": 6855.96630859375,
+      "completions/min_length": 1335.0,
+      "completions/min_terminated_length": 1335.0,
+      "entropy": 0.7636929750442505,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004298723768442869,
+      "learning_rate": 1e-05,
+      "loss": 0.145,
+      "num_tokens": 190462227.0,
+      "reward": 0.515625,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999310374259949,
+      "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05,
+      "sampling/sampling_logp_difference/max": 9.996363639831543,
+      "sampling/sampling_logp_difference/mean": 0.018035393208265305,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 1.4060602325116633e-05,
+      "clip_ratio/high_mean": 3.5151505812791584e-06,
+      "clip_ratio/low_mean": 2.6516039497437305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.003119024924672e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15151.0,
+      "completions/mean_length": 6512.0,
+      "completions/mean_terminated_length": 6434.267578125,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.9043584689497948,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006741553544998169,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 191312483.0,
+      "reward": 0.484375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 1.778468504198827e-05,
+      "sampling/sampling_logp_difference/max": 10.937172889709473,
+      "sampling/sampling_logp_difference/mean": 0.020878732204437256,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 1.7356085209030425e-05,
+      "clip_ratio/high_mean": 4.339021302257606e-06,
+      "clip_ratio/low_mean": 2.8831826739406097e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.317084781429003e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7178.6875,
+      "completions/mean_terminated_length": 6565.00048828125,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.8899475410580635,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00281486171297729,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 192251235.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2240736484527588,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999714493751526,
+      "sampling/importance_sampling_ratio/min": 9.012543159769848e-05,
+      "sampling/sampling_logp_difference/max": 9.314308166503906,
+      "sampling/sampling_logp_difference/mean": 0.020196784287691116,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.5558084214717383e-05,
+      "clip_ratio/high_mean": 3.889521053679346e-06,
+      "clip_ratio/low_mean": 3.0248688972278615e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.413820991227112e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15501.0,
+      "completions/max_terminated_length": 15501.0,
+      "completions/mean_length": 6602.5625,
+      "completions/mean_terminated_length": 6602.5625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.9266818463802338,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005070593673735857,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193116763.0,
+      "reward": 0.53125,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999746680259705,
+      "sampling/importance_sampling_ratio/min": 2.726537559283315e-06,
+      "sampling/sampling_logp_difference/max": 12.812478065490723,
+      "sampling/sampling_logp_difference/mean": 0.020026464015245438,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.188727416476468e-06,
+      "clip_ratio/high_mean": 1.047181854119117e-06,
+      "clip_ratio/low_mean": 2.959152834591805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.063871008635033e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6818.8828125,
+      "completions/mean_terminated_length": 6430.056640625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.874519519507885,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006362155079841614,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 194007868.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009298324585,
+      "sampling/importance_sampling_ratio/min": 0.0005216691642999649,
+      "sampling/sampling_logp_difference/max": 7.55847692489624,
+      "sampling/sampling_logp_difference/mean": 0.01943325623869896,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 9.645911177358357e-06,
+      "clip_ratio/high_mean": 2.4114777943395893e-06,
+      "clip_ratio/low_mean": 6.821557258263056e-05,
+      "clip_ratio/low_min": 1.7265090718865395e-05,
+      "clip_ratio/region_mean": 7.062705049065698e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14536.0,
+      "completions/mean_length": 5515.625,
+      "completions/mean_terminated_length": 5343.111328125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0683523043990135,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003797185141593218,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "num_tokens": 194735980.0,
+      "reward": 0.421875,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 1.137102216830499e-07,
+      "sampling/sampling_logp_difference/max": 15.989612579345703,
+      "sampling/sampling_logp_difference/mean": 0.02120930328965187,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.1971412252241862e-05,
+      "clip_ratio/high_mean": 5.4928530630604655e-06,
+      "clip_ratio/low_mean": 4.9151800567415194e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4644653801005916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 5853.546875,
+      "completions/mean_terminated_length": 5770.6298828125,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.7975900694727898,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004124365746974945,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 195504882.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000672340393066,
+      "sampling/importance_sampling_ratio/min": 0.0032877910416573286,
+      "sampling/sampling_logp_difference/max": 5.717539310455322,
+      "sampling/sampling_logp_difference/mean": 0.017819223925471306,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 7.066538728395244e-06,
+      "clip_ratio/high_mean": 2.843255515472265e-06,
+      "clip_ratio/low_mean": 5.1467116236381116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.431037175185338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6686.25,
+      "completions/mean_terminated_length": 6532.31787109375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.9018580466508865,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024995009880512953,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 196379306.0,
+      "reward": 0.421875,
+      "reward_std": 0.35824593901634216,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999300837516785,
+      "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05,
+      "sampling/sampling_logp_difference/max": 10.818918228149414,
+      "sampling/sampling_logp_difference/mean": 0.018989525735378265,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 6.652828687947476e-06,
+      "clip_ratio/high_mean": 2.5722979444253724e-06,
+      "clip_ratio/low_mean": 3.699686294567073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95691608900961e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16347.0,
+      "completions/mean_length": 7487.3359375,
+      "completions/mean_terminated_length": 7200.3466796875,
+      "completions/min_length": 1222.0,
+      "completions/min_terminated_length": 1222.0,
+      "entropy": 0.9890001565217972,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004295211285352707,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 197357397.0,
+      "reward": 0.40625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0006548459641635418,
+      "sampling/sampling_logp_difference/max": 7.33111047744751,
+      "sampling/sampling_logp_difference/mean": 0.02209121733903885,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 6.0850939007650595e-06,
+      "clip_ratio/high_mean": 1.5212734751912649e-06,
+      "clip_ratio/low_mean": 2.9443070673096372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0964344205131056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7233.484375,
+      "completions/mean_terminated_length": 6938.30615234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.9683803990483284,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003119673579931259,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 198303795.0,
+      "reward": 0.328125,
+      "reward_std": 0.23014704883098602,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000243186950684,
+      "sampling/importance_sampling_ratio/min": 0.020358745008707047,
+      "sampling/sampling_logp_difference/max": 3.89424467086792,
+      "sampling/sampling_logp_difference/mean": 0.021085180342197418,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.963812095113099e-06,
+      "clip_ratio/high_mean": 1.9909530237782747e-06,
+      "clip_ratio/low_mean": 4.031422963635123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.23051826601295e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15733.0,
+      "completions/mean_length": 6457.78125,
+      "completions/mean_terminated_length": 6300.22265625,
+      "completions/min_length": 850.0,
+      "completions/min_terminated_length": 850.0,
+      "entropy": 0.8881053999066353,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033790848683565855,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 199154735.0,
+      "reward": 0.3828125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998799562454224,
+      "sampling/importance_sampling_ratio/min": 2.872048128210736e-07,
+      "sampling/sampling_logp_difference/max": 15.063070297241211,
+      "sampling/sampling_logp_difference/mean": 0.01950821653008461,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 9.059622016138746e-06,
+      "clip_ratio/high_mean": 3.3430123380639998e-06,
+      "clip_ratio/low_mean": 2.2856192117615137e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6199204512522556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 7904.40625,
+      "completions/mean_terminated_length": 7769.81005859375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9881557524204254,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021492803934961557,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 200185643.0,
+      "reward": 0.359375,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001094341278076,
+      "sampling/importance_sampling_ratio/min": 0.001458622980862856,
+      "sampling/sampling_logp_difference/max": 6.530262470245361,
+      "sampling/sampling_logp_difference/mean": 0.021201875060796738,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 6.9962839006620925e-06,
+      "clip_ratio/high_mean": 1.7490709751655231e-06,
+      "clip_ratio/low_mean": 3.018811844412994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193718976035598e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 7414.4921875,
+      "completions/mean_terminated_length": 7414.4921875,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "entropy": 0.9571134969592094,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037221095990389585,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 201153114.0,
+      "reward": 0.4375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999958872795105,
+      "sampling/importance_sampling_ratio/min": 0.0009130563121289015,
+      "sampling/sampling_logp_difference/max": 6.99871301651001,
+      "sampling/sampling_logp_difference/mean": 0.021356744691729546,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 1.1248092050664127e-05,
+      "clip_ratio/high_mean": 2.8120230126660317e-06,
+      "clip_ratio/low_mean": 5.4354991334548686e-05,
+      "clip_ratio/low_min": 6.868132004456129e-06,
+      "clip_ratio/region_mean": 5.716701480196207e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15835.0,
+      "completions/max_terminated_length": 15835.0,
+      "completions/mean_length": 5955.953125,
+      "completions/mean_terminated_length": 5955.953125,
+      "completions/min_length": 1394.0,
+      "completions/min_terminated_length": 1394.0,
+      "entropy": 0.730999618768692,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006285305600613356,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 201933044.0,
+      "reward": 0.59375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.007535050623118877,
+      "sampling/sampling_logp_difference/max": 4.888189792633057,
+      "sampling/sampling_logp_difference/mean": 0.016975615173578262,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 7.226686648209579e-06,
+      "clip_ratio/high_mean": 3.094216481258627e-06,
+      "clip_ratio/low_mean": 4.66828214484849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.977703792974353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6923.3515625,
+      "completions/mean_terminated_length": 6458.0732421875,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9938417226076126,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005667983554303646,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 202837281.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05,
+      "sampling/sampling_logp_difference/max": 10.402952194213867,
+      "sampling/sampling_logp_difference/mean": 0.022059854120016098,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 5.2318769121484365e-06,
+      "clip_ratio/high_mean": 1.3079692280371091e-06,
+      "clip_ratio/low_mean": 4.239228087499214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3700250216716086e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14726.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 5930.9296875,
+      "completions/mean_terminated_length": 5930.9296875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.8100385963916779,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004052883945405483,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 203614448.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999989926815033,
+      "sampling/importance_sampling_ratio/min": 0.00015170808183029294,
+      "sampling/sampling_logp_difference/max": 8.79355239868164,
+      "sampling/sampling_logp_difference/mean": 0.018519222736358643,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 4.905230980511988e-06,
+      "clip_ratio/high_mean": 1.226307745127997e-06,
+      "clip_ratio/low_mean": 5.500513248080097e-05,
+      "clip_ratio/low_min": 7.924934834591113e-06,
+      "clip_ratio/region_mean": 5.6231440112242126e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14996.0,
+      "completions/mean_length": 6911.1015625,
+      "completions/mean_terminated_length": 6108.3134765625,
+      "completions/min_length": 862.0,
+      "completions/min_terminated_length": 862.0,
+      "entropy": 0.9260227829217911,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004494607914239168,
+      "learning_rate": 1e-05,
+      "loss": 0.0269,
+      "num_tokens": 204518261.0,
+      "reward": 0.4140625,
+      "reward_std": 0.34033796191215515,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998886585235596,
+      "sampling/importance_sampling_ratio/min": 0.0015266009140759706,
+      "sampling/sampling_logp_difference/max": 6.484711647033691,
+      "sampling/sampling_logp_difference/mean": 0.020527629181742668,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 8.293764039990492e-06,
+      "clip_ratio/high_mean": 2.073441009997623e-06,
+      "clip_ratio/low_mean": 4.75325257411896e-05,
+      "clip_ratio/low_min": 3.599504680096288e-06,
+      "clip_ratio/region_mean": 4.960596663750039e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14637.0,
+      "completions/mean_length": 6972.921875,
+      "completions/mean_terminated_length": 6823.5400390625,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 1.0095533654093742,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029451537411659956,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 205433843.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05,
+      "sampling/sampling_logp_difference/max": 10.53177547454834,
+      "sampling/sampling_logp_difference/mean": 0.02013089321553707,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 4.163383164268453e-05,
+      "clip_ratio/high_mean": 1.382379150527413e-05,
+      "clip_ratio/low_mean": 3.86000854177837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2423876240936806e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 6706.6640625,
+      "completions/mean_terminated_length": 6313.2763671875,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 0.8647518903017044,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003371767932549119,
+      "learning_rate": 1e-05,
+      "loss": 0.073,
+      "num_tokens": 206310296.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 2.948181463580113e-05,
+      "sampling/sampling_logp_difference/max": 10.431736946105957,
+      "sampling/sampling_logp_difference/mean": 0.019770190119743347,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4946740381892596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4946740381892596e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 6882.609375,
+      "completions/mean_terminated_length": 6415.32763671875,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "entropy": 1.013342760503292,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016336971893906593,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 207210974.0,
+      "reward": 0.359375,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999210834503174,
+      "sampling/importance_sampling_ratio/min": 0.0013267879839986563,
+      "sampling/sampling_logp_difference/max": 6.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.02139991894364357,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.4866403944324702e-05,
+      "clip_ratio/high_mean": 3.7166009860811755e-06,
+      "clip_ratio/low_mean": 3.938925010515959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.310585177336179e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15203.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 6195.7421875,
+      "completions/mean_terminated_length": 6195.7421875,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.8448907434940338,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005036406684666872,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208021893.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955892562866,
+      "sampling/importance_sampling_ratio/min": 0.0040348549373447895,
+      "sampling/sampling_logp_difference/max": 5.512784957885742,
+      "sampling/sampling_logp_difference/mean": 0.018679853528738022,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.1244883353356272e-05,
+      "clip_ratio/high_mean": 2.811220838339068e-06,
+      "clip_ratio/low_mean": 3.422392001084518e-05,
+      "clip_ratio/low_min": 6.451612989621935e-06,
+      "clip_ratio/region_mean": 3.703514119024476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6829.609375,
+      "completions/mean_terminated_length": 6521.40283203125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.8679579794406891,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029643685556948185,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 208912059.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00038063788088038564,
+      "sampling/sampling_logp_difference/max": 7.873661994934082,
+      "sampling/sampling_logp_difference/mean": 0.018488366156816483,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 2.2700600311509334e-05,
+      "clip_ratio/high_mean": 5.675150077877333e-06,
+      "clip_ratio/low_mean": 3.138338854569156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.705853873725573e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14503.0,
+      "completions/max_terminated_length": 14503.0,
+      "completions/mean_length": 5444.4453125,
+      "completions/mean_terminated_length": 5444.4453125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0460086688399315,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035942886024713516,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "num_tokens": 209627804.0,
+      "reward": 0.484375,
+      "reward_std": 0.338498055934906,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997478723526,
+      "sampling/importance_sampling_ratio/min": 0.03179635480046272,
+      "sampling/sampling_logp_difference/max": 3.4484035968780518,
+      "sampling/sampling_logp_difference/mean": 0.020146891474723816,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.477029400120955e-05,
+      "clip_ratio/high_mean": 4.552578502625693e-06,
+      "clip_ratio/low_mean": 5.265122354103369e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.720380158891203e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16244.0,
+      "completions/mean_length": 7657.390625,
+      "completions/mean_terminated_length": 7152.544921875,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 0.9528728649020195,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0044983453117311,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 210630150.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000007152557373,
+      "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05,
+      "sampling/sampling_logp_difference/max": 10.158285140991211,
+      "sampling/sampling_logp_difference/mean": 0.02131088823080063,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 8.607642712377128e-06,
+      "clip_ratio/high_mean": 2.151910678094282e-06,
+      "clip_ratio/low_mean": 2.2759413695894182e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.491132454451872e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7574.3515625,
+      "completions/mean_terminated_length": 7504.984375,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 1.0009776800870895,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006095650140196085,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 211620355.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 0.0013946897815912962,
+      "sampling/sampling_logp_difference/max": 6.575083255767822,
+      "sampling/sampling_logp_difference/mean": 0.021727774292230606,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.764823082339717e-05,
+      "clip_ratio/high_mean": 5.141430960975413e-06,
+      "clip_ratio/low_mean": 5.936152001595474e-05,
+      "clip_ratio/low_min": 9.155588486464694e-06,
+      "clip_ratio/region_mean": 6.450295177273802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14915.0,
+      "completions/mean_length": 7919.6875,
+      "completions/mean_terminated_length": 7716.54443359375,
+      "completions/min_length": 1517.0,
+      "completions/min_terminated_length": 1517.0,
+      "entropy": 1.0405654236674309,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037038614973425865,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 212654747.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381899833679,
+      "sampling/importance_sampling_ratio/min": 0.0057550109922885895,
+      "sampling/sampling_logp_difference/max": 5.157684326171875,
+      "sampling/sampling_logp_difference/mean": 0.022051017731428146,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.265254240934155e-05,
+      "clip_ratio/high_mean": 3.1631356023353874e-06,
+      "clip_ratio/low_mean": 4.716233138424286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.032546687289141e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 8613.4765625,
+      "completions/mean_terminated_length": 7735.0693359375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.890489287674427,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325607368722558,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 213774584.0,
+      "reward": 0.40625,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000060796737671,
+      "sampling/importance_sampling_ratio/min": 1.670176425250247e-05,
+      "sampling/sampling_logp_difference/max": 10.999996185302734,
+      "sampling/sampling_logp_difference/mean": 0.020002499222755432,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.6404605503339553e-05,
+      "clip_ratio/high_mean": 4.101151375834888e-06,
+      "clip_ratio/low_mean": 3.880500707964529e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2906158682853857e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7324.8984375,
+      "completions/mean_terminated_length": 6473.1884765625,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 0.761004202067852,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038265211042016745,
+      "learning_rate": 1e-05,
+      "loss": 0.0717,
+      "num_tokens": 214728371.0,
+      "reward": 0.515625,
+      "reward_std": 0.32719239592552185,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000168085098267,
+      "sampling/importance_sampling_ratio/min": 0.0003049026126973331,
+      "sampling/sampling_logp_difference/max": 8.095518112182617,
+      "sampling/sampling_logp_difference/mean": 0.018367979675531387,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 5.624549885396846e-06,
+      "clip_ratio/high_mean": 1.4061374713492114e-06,
+      "clip_ratio/low_mean": 3.6433707123251224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7839844594600436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14167.0,
+      "completions/max_terminated_length": 14167.0,
+      "completions/mean_length": 6422.0859375,
+      "completions/mean_terminated_length": 6422.0859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.9946094751358032,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002729539293795824,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 215570806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.026308411732316017,
+      "sampling/sampling_logp_difference/max": 3.637866497039795,
+      "sampling/sampling_logp_difference/mean": 0.021903935819864273,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 7.2379848461423535e-06,
+      "clip_ratio/high_mean": 1.8094962115355884e-06,
+      "clip_ratio/low_mean": 3.17277934982485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353728982347093e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15585.0,
+      "completions/mean_length": 6845.2890625,
+      "completions/mean_terminated_length": 6693.88134765625,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8822609707713127,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004974282346665859,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 216465635.0,
+      "reward": 0.5390625,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 8.749838889343664e-05,
+      "sampling/sampling_logp_difference/max": 9.343890190124512,
+      "sampling/sampling_logp_difference/mean": 0.019389234483242035,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.58592818024772e-05,
+      "clip_ratio/high_mean": 3.9648204506193e-06,
+      "clip_ratio/low_mean": 4.096964960353944e-05,
+      "clip_ratio/low_min": 1.7403560605089297e-05,
+      "clip_ratio/region_mean": 4.49344687467601e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 7805.484375,
+      "completions/mean_terminated_length": 7528.7578125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.9977599084377289,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0033159854356199503,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 217485089.0,
+      "reward": 0.421875,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999412298202515,
+      "sampling/importance_sampling_ratio/min": 7.967943383846432e-05,
+      "sampling/sampling_logp_difference/max": 9.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.021925684064626694,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.8265397557115648e-05,
+      "clip_ratio/high_mean": 4.566349389278912e-06,
+      "clip_ratio/low_mean": 4.044636898470344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5012717691861326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15681.0,
+      "completions/mean_length": 7737.5546875,
+      "completions/mean_terminated_length": 7530.04052734375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.8667014688253403,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034952745772898197,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "num_tokens": 218496040.0,
+      "reward": 0.453125,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999128580093384,
+      "sampling/importance_sampling_ratio/min": 6.726370338583365e-05,
+      "sampling/sampling_logp_difference/max": 9.606889724731445,
+      "sampling/sampling_logp_difference/mean": 0.019742710515856743,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 8.244294804171659e-06,
+      "clip_ratio/high_mean": 2.0610737010429148e-06,
+      "clip_ratio/low_mean": 3.204250072030845e-05,
+      "clip_ratio/low_min": 3.323495775475749e-06,
+      "clip_ratio/region_mean": 3.410357436450795e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15858.0,
+      "completions/mean_length": 7365.84375,
+      "completions/mean_terminated_length": 6601.59326171875,
+      "completions/min_length": 744.0,
+      "completions/min_terminated_length": 744.0,
+      "entropy": 0.8151945173740387,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038676802068948746,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 219459140.0,
+      "reward": 0.46875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00023387260443996638,
+      "sampling/sampling_logp_difference/max": 8.360733985900879,
+      "sampling/sampling_logp_difference/mean": 0.018882082775235176,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 6.87833608026267e-06,
+      "clip_ratio/high_mean": 2.9462287329806713e-06,
+      "clip_ratio/low_mean": 5.435333650893881e-05,
+      "clip_ratio/low_min": 5.33937054569833e-06,
+      "clip_ratio/region_mean": 5.729956546929316e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 6448.0078125,
+      "completions/mean_terminated_length": 6369.771484375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9546648040413857,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004310046322643757,
+      "learning_rate": 1e-05,
+      "loss": 0.1082,
+      "num_tokens": 220304605.0,
+      "reward": 0.5703125,
+      "reward_std": 0.35611939430236816,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999396800994873,
+      "sampling/importance_sampling_ratio/min": 0.0001234127557836473,
+      "sampling/sampling_logp_difference/max": 8.99997615814209,
+      "sampling/sampling_logp_difference/mean": 0.020253397524356842,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 6.196094091137638e-06,
+      "clip_ratio/high_mean": 1.5490235227844096e-06,
+      "clip_ratio/low_mean": 2.5416685957679874e-05,
+      "clip_ratio/low_min": 5.5736391004757024e-06,
+      "clip_ratio/region_mean": 2.696570959415112e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 7457.6484375,
+      "completions/mean_terminated_length": 6941.24755859375,
+      "completions/min_length": 604.0,
+      "completions/min_terminated_length": 604.0,
+      "entropy": 0.8182889074087143,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026646999176591635,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 221281968.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2012200653553009,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173283576965,
+      "sampling/importance_sampling_ratio/min": 2.902353571698768e-06,
+      "sampling/sampling_logp_difference/max": 12.749988555908203,
+      "sampling/sampling_logp_difference/mean": 0.019208962097764015,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 1.6189535017474554e-05,
+      "clip_ratio/high_mean": 4.047383754368639e-06,
+      "clip_ratio/low_mean": 3.127787306311802e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.532525670379982e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 8561.109375,
+      "completions/mean_terminated_length": 7969.79052734375,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.9581378549337387,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016026750672608614,
+      "learning_rate": 1e-05,
+      "loss": 0.0131,
+      "num_tokens": 222399046.0,
+      "reward": 0.34375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 1.653693971093162e-06,
+      "sampling/sampling_logp_difference/max": 13.312499046325684,
+      "sampling/sampling_logp_difference/mean": 0.02173236384987831,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 1.4200771602190798e-05,
+      "clip_ratio/high_mean": 4.3255887476334465e-06,
+      "clip_ratio/low_mean": 5.2955770115659107e-05,
+      "clip_ratio/low_min": 3.402656830076012e-06,
+      "clip_ratio/region_mean": 5.7281358749605715e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16239.0,
+      "completions/mean_length": 7152.34375,
+      "completions/mean_terminated_length": 7079.6533203125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9052041247487068,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005460259038954973,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 223335010.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3356297016143799,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999966621398926,
+      "sampling/importance_sampling_ratio/min": 0.010161337442696095,
+      "sampling/sampling_logp_difference/max": 4.589165210723877,
+      "sampling/sampling_logp_difference/mean": 0.01986619457602501,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 1.4350314813782461e-05,
+      "clip_ratio/high_mean": 3.5875787034456152e-06,
+      "clip_ratio/low_mean": 3.81288905373367e-05,
+      "clip_ratio/low_min": 8.099272235995159e-06,
+      "clip_ratio/region_mean": 4.1716469809216505e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 6678.65625,
+      "completions/mean_terminated_length": 6524.603515625,
+      "completions/min_length": 963.0,
+      "completions/min_terminated_length": 963.0,
+      "entropy": 0.9043187350034714,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005933742038905621,
+      "learning_rate": 1e-05,
+      "loss": 0.0966,
+      "num_tokens": 224207006.0,
+      "reward": 0.484375,
+      "reward_std": 0.3316681981086731,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000031590461731,
+      "sampling/importance_sampling_ratio/min": 0.0011734943836927414,
+      "sampling/sampling_logp_difference/max": 6.747769355773926,
+      "sampling/sampling_logp_difference/mean": 0.019827336072921753,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 1.6498819377375185e-05,
+      "clip_ratio/high_mean": 4.124704844343796e-06,
+      "clip_ratio/low_mean": 3.601791678420341e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.014262168539062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6999.0390625,
+      "completions/mean_terminated_length": 6850.07177734375,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8109970837831497,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003635740838944912,
+      "learning_rate": 1e-05,
+      "loss": 0.104,
+      "num_tokens": 225122891.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999303817749023,
+      "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05,
+      "sampling/sampling_logp_difference/max": 10.987512588500977,
+      "sampling/sampling_logp_difference/mean": 0.018912551924586296,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 9.527577958579059e-06,
+      "clip_ratio/high_mean": 2.3818944896447647e-06,
+      "clip_ratio/low_mean": 3.766565987461945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.004755419373396e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15713.0,
+      "completions/mean_length": 7483.7109375,
+      "completions/mean_terminated_length": 7045.9912109375,
+      "completions/min_length": 1153.0,
+      "completions/min_terminated_length": 1153.0,
+      "entropy": 0.9473970532417297,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003405241761356592,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 226102462.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00002920627594,
+      "sampling/importance_sampling_ratio/min": 0.00525119062513113,
+      "sampling/sampling_logp_difference/max": 5.249300479888916,
+      "sampling/sampling_logp_difference/mean": 0.021076779812574387,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.5867321963014547e-05,
+      "clip_ratio/high_mean": 3.966830490753637e-06,
+      "clip_ratio/low_mean": 3.8259706570897833e-05,
+      "clip_ratio/low_min": 3.549019083948224e-06,
+      "clip_ratio/region_mean": 4.2226537743772496e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 7569.03125,
+      "completions/mean_terminated_length": 7357.47216796875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9231455475091934,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025927501264959574,
+      "learning_rate": 1e-05,
+      "loss": 0.0801,
+      "num_tokens": 227093562.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19097033143043518,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0052477638237178326,
+      "sampling/sampling_logp_difference/max": 5.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.020578444004058838,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.344091060673236e-05,
+      "clip_ratio/high_mean": 3.36022765168309e-06,
+      "clip_ratio/low_mean": 4.253613235505327e-05,
+      "clip_ratio/low_min": 3.5579084851633525e-06,
+      "clip_ratio/region_mean": 4.5896360120423196e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 7589.2734375,
+      "completions/mean_terminated_length": 7378.2001953125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9265239909291267,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030512227676808834,
+      "learning_rate": 1e-05,
+      "loss": 0.04,
+      "num_tokens": 228086405.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0002165911573683843,
+      "sampling/sampling_logp_difference/max": 8.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.020208362489938736,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.9613525410022703e-05,
+      "clip_ratio/high_mean": 4.903381352505676e-06,
+      "clip_ratio/low_mean": 3.184792547017423e-05,
+      "clip_ratio/low_min": 7.29296516510658e-06,
+      "clip_ratio/region_mean": 3.675130722058384e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 8420.6875,
+      "completions/mean_terminated_length": 8096.97509765625,
+      "completions/min_length": 1114.0,
+      "completions/min_terminated_length": 1114.0,
+      "entropy": 0.9572964608669281,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022430522367358208,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 229183765.0,
+      "reward": 0.34375,
+      "reward_std": 0.309583842754364,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 0.00029693738906644285,
+      "sampling/sampling_logp_difference/max": 8.121989250183105,
+      "sampling/sampling_logp_difference/mean": 0.021570362150669098,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.728750577167375e-06,
+      "clip_ratio/high_mean": 1.6821876442918438e-06,
+      "clip_ratio/low_mean": 2.1682553096979973e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.336474062758498e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15736.0,
+      "completions/mean_length": 6809.765625,
+      "completions/mean_terminated_length": 6579.984375,
+      "completions/min_length": 860.0,
+      "completions/min_terminated_length": 860.0,
+      "entropy": 0.884086549282074,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004295065999031067,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 230077607.0,
+      "reward": 0.484375,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00754612497985363,
+      "sampling/sampling_logp_difference/max": 4.886721134185791,
+      "sampling/sampling_logp_difference/mean": 0.019895706325769424,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 2.8609347509700456e-05,
+      "clip_ratio/high_mean": 7.152336877425114e-06,
+      "clip_ratio/low_mean": 5.158006410965754e-05,
+      "clip_ratio/low_min": 5.210069957684027e-06,
+      "clip_ratio/region_mean": 5.873240070286556e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15080.0,
+      "completions/mean_length": 7340.6953125,
+      "completions/mean_terminated_length": 6973.0810546875,
+      "completions/min_length": 1616.0,
+      "completions/min_terminated_length": 1616.0,
+      "entropy": 0.9920620769262314,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004631794057786465,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 231035616.0,
+      "reward": 0.4375,
+      "reward_std": 0.3235401213169098,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337792396545,
+      "sampling/importance_sampling_ratio/min": 0.0002508950710762292,
+      "sampling/sampling_logp_difference/max": 8.290475845336914,
+      "sampling/sampling_logp_difference/mean": 0.020591016858816147,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.3085940774290066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3085940774290066e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14120.0,
+      "completions/mean_length": 6748.875,
+      "completions/mean_terminated_length": 6595.93701171875,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.9867061004042625,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035752104595303535,
+      "learning_rate": 1e-05,
+      "loss": 0.0455,
+      "num_tokens": 231920056.0,
+      "reward": 0.40625,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999653100967407,
+      "sampling/importance_sampling_ratio/min": 0.0003869794018100947,
+      "sampling/sampling_logp_difference/max": 7.8571391105651855,
+      "sampling/sampling_logp_difference/mean": 0.02061416581273079,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 1.2506750408647349e-05,
+      "clip_ratio/high_mean": 3.1266876021618373e-06,
+      "clip_ratio/low_mean": 3.10397430212106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.416643085074611e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 7260.3046875,
+      "completions/mean_terminated_length": 7188.46435546875,
+      "completions/min_length": 1384.0,
+      "completions/min_terminated_length": 1384.0,
+      "entropy": 1.0388494208455086,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036644963547587395,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 232869159.0,
+      "reward": 0.390625,
+      "reward_std": 0.2359209954738617,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999546408653259,
+      "sampling/importance_sampling_ratio/min": 0.0008660226594656706,
+      "sampling/sampling_logp_difference/max": 7.051599502563477,
+      "sampling/sampling_logp_difference/mean": 0.02120530977845192,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.704355301830219e-05,
+      "clip_ratio/high_mean": 6.760888254575548e-06,
+      "clip_ratio/low_mean": 3.1861192269388994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.862208097871189e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16073.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 6354.4609375,
+      "completions/mean_terminated_length": 6354.4609375,
+      "completions/min_length": 1035.0,
+      "completions/min_terminated_length": 1035.0,
+      "entropy": 0.8405331820249557,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004709267523139715,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 233702842.0,
+      "reward": 0.546875,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 0.0046309432946145535,
+      "sampling/sampling_logp_difference/max": 5.37499475479126,
+      "sampling/sampling_logp_difference/mean": 0.019126038998365402,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 9.749228638611385e-06,
+      "clip_ratio/high_mean": 2.437307159652846e-06,
+      "clip_ratio/low_mean": 3.855073941849696e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.098804652130639e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16026.0,
+      "completions/mean_length": 6514.578125,
+      "completions/mean_terminated_length": 6357.9208984375,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "entropy": 1.0254098922014236,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003066045930609107,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 234556348.0,
+      "reward": 0.4375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805092811584,
+      "sampling/importance_sampling_ratio/min": 0.005210204049944878,
+      "sampling/sampling_logp_difference/max": 5.257136344909668,
+      "sampling/sampling_logp_difference/mean": 0.019960148259997368,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.0475813724042382e-05,
+      "clip_ratio/high_mean": 2.6189534310105955e-06,
+      "clip_ratio/low_mean": 3.487835761006863e-05,
+      "clip_ratio/low_min": 2.9392399483185727e-06,
+      "clip_ratio/region_mean": 3.749731081370555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 7379.5546875,
+      "completions/mean_terminated_length": 7236.62744140625,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 1.0397320613265038,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005132520105689764,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 235521091.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519364118576,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999256134033203,
+      "sampling/importance_sampling_ratio/min": 0.00016659013635944575,
+      "sampling/sampling_logp_difference/max": 8.699974060058594,
+      "sampling/sampling_logp_difference/mean": 0.021417103707790375,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.9904123973901733e-05,
+      "clip_ratio/high_mean": 5.776861314643611e-06,
+      "clip_ratio/low_mean": 2.6659268655748747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2436129686175263e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14565.0,
+      "completions/mean_length": 7837.1640625,
+      "completions/mean_terminated_length": 7632.04052734375,
+      "completions/min_length": 1346.0,
+      "completions/min_terminated_length": 1346.0,
+      "entropy": 0.8400963917374611,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028969801496714354,
+      "learning_rate": 1e-05,
+      "loss": 0.0143,
+      "num_tokens": 236544160.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887943267822,
+      "sampling/importance_sampling_ratio/min": 2.883308241052873e-07,
+      "sampling/sampling_logp_difference/max": 15.059157371520996,
+      "sampling/sampling_logp_difference/mean": 0.019267702475190163,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 8.562770290154731e-06,
+      "clip_ratio/high_mean": 2.1406925725386827e-06,
+      "clip_ratio/low_mean": 4.060094340729847e-05,
+      "clip_ratio/low_min": 3.8700886761944275e-06,
+      "clip_ratio/region_mean": 4.2741635979837156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15350.0,
+      "completions/mean_length": 6696.3515625,
+      "completions/mean_terminated_length": 6542.57958984375,
+      "completions/min_length": 1239.0,
+      "completions/min_terminated_length": 1239.0,
+      "entropy": 0.8495818004012108,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003412836929783225,
+      "learning_rate": 1e-05,
+      "loss": 0.0803,
+      "num_tokens": 237423101.0,
+      "reward": 0.515625,
+      "reward_std": 0.37981897592544556,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.012152798473834991,
+      "sampling/sampling_logp_difference/max": 4.410195827484131,
+      "sampling/sampling_logp_difference/mean": 0.018458625301718712,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 1.1463653436294408e-05,
+      "clip_ratio/high_mean": 3.646129641765583e-06,
+      "clip_ratio/low_mean": 6.144847083078275e-05,
+      "clip_ratio/low_min": 1.110105540647055e-05,
+      "clip_ratio/region_mean": 6.509460160941671e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15666.0,
+      "completions/mean_length": 7700.3671875,
+      "completions/mean_terminated_length": 7121.45849609375,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "entropy": 0.8258870914578438,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024443145375698805,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 238429956.0,
+      "reward": 0.375,
+      "reward_std": 0.2872493863105774,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999113082885742,
+      "sampling/importance_sampling_ratio/min": 0.00026112530031241477,
+      "sampling/sampling_logp_difference/max": 8.250510215759277,
+      "sampling/sampling_logp_difference/mean": 0.019427984952926636,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 4.218127742205979e-06,
+      "clip_ratio/high_mean": 1.0545319355514948e-06,
+      "clip_ratio/low_mean": 1.7289162997258245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.834369493280974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16112.0,
+      "completions/mean_length": 6255.21875,
+      "completions/mean_terminated_length": 6094.44482421875,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.8179014846682549,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022747826296836138,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 239250160.0,
+      "reward": 0.5234375,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.0002633975527714938,
+      "sampling/sampling_logp_difference/max": 8.241846084594727,
+      "sampling/sampling_logp_difference/mean": 0.018723051995038986,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 1.698448841125355e-05,
+      "clip_ratio/high_mean": 5.369374321162468e-06,
+      "clip_ratio/low_mean": 6.14647315160255e-05,
+      "clip_ratio/low_min": 5.043576493335422e-06,
+      "clip_ratio/region_mean": 6.683410583718796e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15321.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6914.9609375,
+      "completions/mean_terminated_length": 6914.9609375,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9700981751084328,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005685295443981886,
+      "learning_rate": 1e-05,
+      "loss": -0.0056,
+      "num_tokens": 240156211.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998887777328491,
+      "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05,
+      "sampling/sampling_logp_difference/max": 9.997581481933594,
+      "sampling/sampling_logp_difference/mean": 0.021195171400904655,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9186837764427764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9186837764427764e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15469.0,
+      "completions/mean_length": 5227.53125,
+      "completions/mean_terminated_length": 5139.68505859375,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.9116031974554062,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003880272386595607,
+      "learning_rate": 1e-05,
+      "loss": 0.1246,
+      "num_tokens": 240845295.0,
+      "reward": 0.6328125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000362396240234,
+      "sampling/importance_sampling_ratio/min": 0.00012422871077433228,
+      "sampling/sampling_logp_difference/max": 8.993386268615723,
+      "sampling/sampling_logp_difference/mean": 0.018801718950271606,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 2.5015486926349695e-05,
+      "clip_ratio/high_mean": 8.084949570275057e-06,
+      "clip_ratio/low_mean": 5.524710468307603e-05,
+      "clip_ratio/low_min": 3.776891389861703e-06,
+      "clip_ratio/region_mean": 6.333205465125502e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 8065.4765625,
+      "completions/mean_terminated_length": 7510.90869140625,
+      "completions/min_length": 1055.0,
+      "completions/min_terminated_length": 1055.0,
+      "entropy": 0.7446574792265892,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028986844699829817,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 241895676.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3474721610546112,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999842643737793,
+      "sampling/importance_sampling_ratio/min": 0.0017039099475368857,
+      "sampling/sampling_logp_difference/max": 6.3748297691345215,
+      "sampling/sampling_logp_difference/mean": 0.01853121444582939,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 9.486341014053323e-06,
+      "clip_ratio/high_mean": 2.371585253513331e-06,
+      "clip_ratio/low_mean": 2.896106741445692e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.133265261112683e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15534.0,
+      "completions/max_terminated_length": 15534.0,
+      "completions/mean_length": 6127.359375,
+      "completions/mean_terminated_length": 6127.359375,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.8569132760167122,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003845847910270095,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 242698258.0,
+      "reward": 0.53125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000942945480347,
+      "sampling/importance_sampling_ratio/min": 0.00043231461313553154,
+      "sampling/sampling_logp_difference/max": 7.746356964111328,
+      "sampling/sampling_logp_difference/mean": 0.01856958493590355,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 2.9848330086679198e-05,
+      "clip_ratio/high_mean": 7.4620825216697995e-06,
+      "clip_ratio/low_mean": 4.3558867673709756e-05,
+      "clip_ratio/low_min": 4.417741820361698e-06,
+      "clip_ratio/region_mean": 5.1020949285884853e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15192.0,
+      "completions/mean_length": 6600.1484375,
+      "completions/mean_terminated_length": 6365.33642578125,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.78924310952425,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003953634761273861,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 243560957.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.0006525487406179309,
+      "sampling/sampling_logp_difference/max": 7.334624767303467,
+      "sampling/sampling_logp_difference/mean": 0.018097909167408943,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 6.635561703660642e-06,
+      "clip_ratio/high_mean": 1.6588904259151604e-06,
+      "clip_ratio/low_mean": 2.737523408313791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9034124281679397e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7852.171875,
+      "completions/mean_terminated_length": 7852.171875,
+      "completions/min_length": 1276.0,
+      "completions/min_terminated_length": 1276.0,
+      "entropy": 1.0598893761634827,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00360781978815794,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 244585923.0,
+      "reward": 0.3125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05,
+      "sampling/sampling_logp_difference/max": 10.076086044311523,
+      "sampling/sampling_logp_difference/mean": 0.022330068051815033,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 3.1540168947685743e-06,
+      "clip_ratio/high_mean": 7.885042236921436e-07,
+      "clip_ratio/low_mean": 4.7973388973332476e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.876189268543385e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7972.2265625,
+      "completions/mean_terminated_length": 7700.87890625,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.933217465877533,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0027661293279379606,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 245628064.0,
+      "reward": 0.28125,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05,
+      "sampling/sampling_logp_difference/max": 10.366576194763184,
+      "sampling/sampling_logp_difference/mean": 0.021125148981809616,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.2965969062861404e-05,
+      "clip_ratio/high_mean": 3.241492265715351e-06,
+      "clip_ratio/low_mean": 4.6317693090713874e-05,
+      "clip_ratio/low_min": 3.820877282123547e-06,
+      "clip_ratio/region_mean": 4.955918507221213e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15744.0,
+      "completions/mean_length": 7135.6953125,
+      "completions/mean_terminated_length": 6913.736328125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 0.7786942347884178,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005680318456143141,
+      "learning_rate": 1e-05,
+      "loss": 0.0786,
+      "num_tokens": 246561329.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999462366104126,
+      "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05,
+      "sampling/sampling_logp_difference/max": 9.737424850463867,
+      "sampling/sampling_logp_difference/mean": 0.018504241481423378,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.22437145175536e-05,
+      "clip_ratio/low_min": 1.4025082009538892e-05,
+      "clip_ratio/region_mean": 4.22437145175536e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 6704.046875,
+      "completions/mean_terminated_length": 6627.82666015625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "entropy": 1.0435140281915665,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026402862276881933,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 247437415.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31276631355285645,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998904466629028,
+      "sampling/importance_sampling_ratio/min": 0.0007800163584761322,
+      "sampling/sampling_logp_difference/max": 7.156195640563965,
+      "sampling/sampling_logp_difference/mean": 0.02134273201227188,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 2.223430897174694e-05,
+      "clip_ratio/high_mean": 6.8746438159905665e-06,
+      "clip_ratio/low_mean": 4.7084630978133646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3959275192028144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 5892.5078125,
+      "completions/mean_terminated_length": 5725.9765625,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "entropy": 0.8004944771528244,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003993614576756954,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 248211112.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0024652592837810516,
+      "sampling/sampling_logp_difference/max": 6.005458354949951,
+      "sampling/sampling_logp_difference/mean": 0.01924925297498703,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 2.1833082200828358e-05,
+      "clip_ratio/high_mean": 5.458270550207089e-06,
+      "clip_ratio/low_mean": 3.415995615796419e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961822596920683e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 7812.140625,
+      "completions/mean_terminated_length": 7316.24755859375,
+      "completions/min_length": 1515.0,
+      "completions/min_terminated_length": 1515.0,
+      "entropy": 0.8841542899608612,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001573400106281042,
+      "learning_rate": 1e-05,
+      "loss": 0.0823,
+      "num_tokens": 249228106.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 0.001001527882181108,
+      "sampling/sampling_logp_difference/max": 6.906228542327881,
+      "sampling/sampling_logp_difference/mean": 0.01956877112388611,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 1.014439021673752e-05,
+      "clip_ratio/high_mean": 2.53609755418438e-06,
+      "clip_ratio/low_mean": 3.068193461785995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.321803217204433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 6372.953125,
+      "completions/mean_terminated_length": 6132.6884765625,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.8228401988744736,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021125099156051874,
+      "learning_rate": 1e-05,
+      "loss": 0.0438,
+      "num_tokens": 250063284.0,
+      "reward": 0.5,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05,
+      "sampling/sampling_logp_difference/max": 9.937475204467773,
+      "sampling/sampling_logp_difference/mean": 0.01943521574139595,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 7.023906164249638e-06,
+      "clip_ratio/high_mean": 1.7559765410624095e-06,
+      "clip_ratio/low_mean": 2.526416994896863e-05,
+      "clip_ratio/low_min": 6.7760895490209805e-06,
+      "clip_ratio/region_mean": 2.7020146660561295e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16270.0,
+      "completions/mean_length": 7817.8671875,
+      "completions/mean_terminated_length": 7396.58154296875,
+      "completions/min_length": 1568.0,
+      "completions/min_terminated_length": 1568.0,
+      "entropy": 0.9454319775104523,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022315154783427715,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 251085123.0,
+      "reward": 0.40625,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06,
+      "sampling/sampling_logp_difference/max": 12.760490417480469,
+      "sampling/sampling_logp_difference/mean": 0.021764669567346573,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 1.4797966287005693e-05,
+      "clip_ratio/high_mean": 3.699491571751423e-06,
+      "clip_ratio/low_mean": 4.36271948274225e-05,
+      "clip_ratio/low_min": 3.6957101201551268e-06,
+      "clip_ratio/region_mean": 4.732668639917392e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 7168.4921875,
+      "completions/mean_terminated_length": 6635.36328125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8433891162276268,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 252020906.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589920043945,
+      "sampling/importance_sampling_ratio/min": 0.0003851866349577904,
+      "sampling/sampling_logp_difference/max": 7.861782550811768,
+      "sampling/sampling_logp_difference/mean": 0.01929781585931778,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 1.996871560550062e-05,
+      "clip_ratio/high_mean": 6.089093403716106e-06,
+      "clip_ratio/low_mean": 4.2792244585143635e-05,
+      "clip_ratio/low_min": 1.0337215371691855e-05,
+      "clip_ratio/region_mean": 4.8881338216233416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7322.5078125,
+      "completions/mean_terminated_length": 6876.8603515625,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 0.9157031401991844,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036942458245903254,
+      "learning_rate": 1e-05,
+      "loss": 0.079,
+      "num_tokens": 252977435.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24275577068328857,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999804496765137,
+      "sampling/importance_sampling_ratio/min": 0.00029605376766994596,
+      "sampling/sampling_logp_difference/max": 8.124969482421875,
+      "sampling/sampling_logp_difference/mean": 0.0205365102738142,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.631919460327481e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.631919460327481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16078.0,
+      "completions/mean_length": 7025.484375,
+      "completions/mean_terminated_length": 6723.5966796875,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 1.1329731941223145,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034127074759453535,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 253896161.0,
+      "reward": 0.25,
+      "reward_std": 0.27722424268722534,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0005197672289796174,
+      "sampling/sampling_logp_difference/max": 7.562129497528076,
+      "sampling/sampling_logp_difference/mean": 0.023741140961647034,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 4.368643658381188e-06,
+      "clip_ratio/high_mean": 1.092160914595297e-06,
+      "clip_ratio/low_mean": 2.4661783299961826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5753944555617636e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13776.0,
+      "completions/mean_length": 5996.1796875,
+      "completions/mean_terminated_length": 5661.08837890625,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8773328885436058,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003959407564252615,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 254690264.0,
+      "reward": 0.53125,
+      "reward_std": 0.26645541191101074,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07,
+      "sampling/sampling_logp_difference/max": 15.73043155670166,
+      "sampling/sampling_logp_difference/mean": 0.018407585099339485,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.616483677935321e-05,
+      "clip_ratio/high_mean": 4.041209194838302e-06,
+      "clip_ratio/low_mean": 3.736187466074625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140308453770558e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16383.0,
+      "completions/mean_length": 7165.328125,
+      "completions/mean_terminated_length": 6867.951171875,
+      "completions/min_length": 1115.0,
+      "completions/min_terminated_length": 1115.0,
+      "entropy": 0.9502597972750664,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030910037457942963,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 255626394.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000731945037842,
+      "sampling/importance_sampling_ratio/min": 0.00022311302018351853,
+      "sampling/sampling_logp_difference/max": 8.407832145690918,
+      "sampling/sampling_logp_difference/mean": 0.020668907091021538,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.1702686606440693e-05,
+      "clip_ratio/high_mean": 2.9256716516101733e-06,
+      "clip_ratio/low_mean": 5.5247357522603124e-05,
+      "clip_ratio/low_min": 3.6811261452385224e-06,
+      "clip_ratio/region_mean": 5.8173028264718596e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15375.0,
+      "completions/mean_length": 8001.9296875,
+      "completions/mean_terminated_length": 7661.34912109375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "entropy": 0.8591345250606537,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037233952898532152,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 256673457.0,
+      "reward": 0.421875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999151229858398,
+      "sampling/importance_sampling_ratio/min": 0.0021876997780054808,
+      "sampling/sampling_logp_difference/max": 6.124904632568359,
+      "sampling/sampling_logp_difference/mean": 0.020540472120046616,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 3.721341136042611e-05,
+      "clip_ratio/high_mean": 1.2759249216287571e-05,
+      "clip_ratio/low_mean": 3.570647322703735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.846572301175911e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 6924.84375,
+      "completions/mean_terminated_length": 6697.82421875,
+      "completions/min_length": 803.0,
+      "completions/min_terminated_length": 803.0,
+      "entropy": 0.7969356626272202,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006054217461496592,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 257578501.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.007889713160693645,
+      "sampling/sampling_logp_difference/max": 4.842195510864258,
+      "sampling/sampling_logp_difference/mean": 0.019306108355522156,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.0211543894911301e-05,
+      "clip_ratio/high_mean": 2.5528859737278253e-06,
+      "clip_ratio/low_mean": 5.2388056587915344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4940942732173426e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14439.0,
+      "completions/mean_length": 6203.03125,
+      "completions/mean_terminated_length": 5958.6884765625,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "entropy": 0.8734413683414459,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004903806839138269,
+      "learning_rate": 1e-05,
+      "loss": 0.0689,
+      "num_tokens": 258392625.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999826550483704,
+      "sampling/importance_sampling_ratio/min": 0.00020370795391499996,
+      "sampling/sampling_logp_difference/max": 8.498823165893555,
+      "sampling/sampling_logp_difference/mean": 0.01909301057457924,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.5135058674786706e-05,
+      "clip_ratio/high_mean": 4.64845766146027e-06,
+      "clip_ratio/low_mean": 4.373456977191381e-05,
+      "clip_ratio/low_min": 3.670856358439778e-06,
+      "clip_ratio/region_mean": 4.8383026296505705e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 7982.5390625,
+      "completions/mean_terminated_length": 7641.01611328125,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0091779381036758,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033637424930930138,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 259435270.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999765753746033,
+      "sampling/importance_sampling_ratio/min": 0.0016514655435457826,
+      "sampling/sampling_logp_difference/max": 6.406092166900635,
+      "sampling/sampling_logp_difference/mean": 0.02182736061513424,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 2.3964702677403693e-05,
+      "clip_ratio/high_mean": 5.991175669350923e-06,
+      "clip_ratio/low_mean": 5.2442986770984135e-05,
+      "clip_ratio/low_min": 8.75736759553547e-06,
+      "clip_ratio/region_mean": 5.843416238349164e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6915.3125,
+      "completions/mean_terminated_length": 6688.064453125,
+      "completions/min_length": 778.0,
+      "completions/min_terminated_length": 778.0,
+      "entropy": 0.7964543774724007,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052203768864274025,
+      "learning_rate": 1e-05,
+      "loss": 0.144,
+      "num_tokens": 260337614.0,
+      "reward": 0.46875,
+      "reward_std": 0.37928223609924316,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 7.032832218101248e-05,
+      "sampling/sampling_logp_difference/max": 9.562335968017578,
+      "sampling/sampling_logp_difference/mean": 0.017896221950650215,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 4.458271632756805e-05,
+      "clip_ratio/high_mean": 1.1145679081892013e-05,
+      "clip_ratio/low_mean": 6.243192206056847e-05,
+      "clip_ratio/low_min": 1.2397775662975619e-05,
+      "clip_ratio/region_mean": 7.357759886872373e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7029.4375,
+      "completions/mean_terminated_length": 6880.95263671875,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "entropy": 0.8605096861720085,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005570738110691309,
+      "learning_rate": 1e-05,
+      "loss": 0.0984,
+      "num_tokens": 261254070.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3327290117740631,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999494552612305,
+      "sampling/importance_sampling_ratio/min": 0.0009070249507203698,
+      "sampling/sampling_logp_difference/max": 7.005340576171875,
+      "sampling/sampling_logp_difference/mean": 0.01905740052461624,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 3.390461233720998e-05,
+      "clip_ratio/high_mean": 1.1191766247975465e-05,
+      "clip_ratio/low_mean": 7.46641262594494e-05,
+      "clip_ratio/low_min": 5.041745680500753e-06,
+      "clip_ratio/region_mean": 8.585589102949598e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5858.84375,
+      "completions/mean_terminated_length": 5606.240234375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.8430554121732712,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004496110137552023,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 262024906.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294877052307,
+      "sampling/importance_sampling_ratio/min": 0.00040469475788995624,
+      "sampling/sampling_logp_difference/max": 7.812377452850342,
+      "sampling/sampling_logp_difference/mean": 0.019225869327783585,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 3.2563955301156966e-06,
+      "clip_ratio/high_mean": 8.140988825289242e-07,
+      "clip_ratio/low_mean": 3.7080020149460324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.789411886145899e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15976.0,
+      "completions/mean_length": 8337.328125,
+      "completions/mean_terminated_length": 7728.7568359375,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.901745393872261,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00348713924176991,
+      "learning_rate": 1e-05,
+      "loss": -0.0002,
+      "num_tokens": 263110844.0,
+      "reward": 0.296875,
+      "reward_std": 0.20805485546588898,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998900890350342,
+      "sampling/importance_sampling_ratio/min": 0.0022652465850114822,
+      "sampling/sampling_logp_difference/max": 6.090071678161621,
+      "sampling/sampling_logp_difference/mean": 0.02157524600625038,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 2.3739744847262045e-05,
+      "clip_ratio/high_mean": 5.934936211815511e-06,
+      "clip_ratio/low_mean": 2.823553325015382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.417046866616147e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 7084.7265625,
+      "completions/mean_terminated_length": 6381.42041015625,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8265534415841103,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003980033565312624,
+      "learning_rate": 1e-05,
+      "loss": 0.0551,
+      "num_tokens": 264036169.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673366546631,
+      "sampling/importance_sampling_ratio/min": 0.00012345099821686745,
+      "sampling/sampling_logp_difference/max": 8.999666213989258,
+      "sampling/sampling_logp_difference/mean": 0.018782664090394974,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 1.1745505617000163e-05,
+      "clip_ratio/high_mean": 3.771558226617344e-06,
+      "clip_ratio/low_mean": 6.913120819262986e-05,
+      "clip_ratio/low_min": 2.494283216947224e-05,
+      "clip_ratio/region_mean": 7.290276607818669e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6543.796875,
+      "completions/mean_terminated_length": 6543.796875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8899869695305824,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.006467343773692846,
+      "learning_rate": 1e-05,
+      "loss": 0.1139,
+      "num_tokens": 264892767.0,
+      "reward": 0.484375,
+      "reward_std": 0.3934885561466217,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000489950180054,
+      "sampling/importance_sampling_ratio/min": 9.891482477542013e-05,
+      "sampling/sampling_logp_difference/max": 9.221251487731934,
+      "sampling/sampling_logp_difference/mean": 0.02032080665230751,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.395576979732141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.395576979732141e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16307.0,
+      "completions/mean_length": 8483.390625,
+      "completions/mean_terminated_length": 7813.84765625,
+      "completions/min_length": 1342.0,
+      "completions/min_terminated_length": 1342.0,
+      "entropy": 0.9621479511260986,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003174177836626768,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 265995697.0,
+      "reward": 0.3359375,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.0005628522485494614,
+      "sampling/sampling_logp_difference/max": 7.4824934005737305,
+      "sampling/sampling_logp_difference/mean": 0.02145479805767536,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.2596524811669951e-05,
+      "clip_ratio/high_mean": 3.149131202917488e-06,
+      "clip_ratio/low_mean": 3.7911659774181317e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.106079018129094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14985.0,
+      "completions/mean_length": 7184.578125,
+      "completions/mean_terminated_length": 6963.79248046875,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.9993807673454285,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003356153378263116,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 266937707.0,
+      "reward": 0.3828125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000238418579102,
+      "sampling/importance_sampling_ratio/min": 0.0017036627978086472,
+      "sampling/sampling_logp_difference/max": 6.374974727630615,
+      "sampling/sampling_logp_difference/mean": 0.02204768732190132,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 1.9245163684900035e-05,
+      "clip_ratio/high_mean": 4.811290921225009e-06,
+      "clip_ratio/low_mean": 4.8845648166206956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.365693925796222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16216.0,
+      "completions/mean_length": 7029.2265625,
+      "completions/mean_terminated_length": 6727.45947265625,
+      "completions/min_length": 851.0,
+      "completions/min_terminated_length": 851.0,
+      "entropy": 0.9139953926205635,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006375293247401714,
+      "learning_rate": 1e-05,
+      "loss": 0.0519,
+      "num_tokens": 267853880.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.010649868287146091,
+      "sampling/sampling_logp_difference/max": 4.542207717895508,
+      "sampling/sampling_logp_difference/mean": 0.020365029573440552,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 4.812504812434781e-06,
+      "clip_ratio/high_mean": 1.2031262031086953e-06,
+      "clip_ratio/low_mean": 2.5999243803198624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.720237000630732e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6188.0078125,
+      "completions/mean_terminated_length": 5943.30419921875,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.7640773430466652,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003697809297591448,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 268665721.0,
+      "reward": 0.5078125,
+      "reward_std": 0.20699402689933777,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372363090515,
+      "sampling/importance_sampling_ratio/min": 0.02927250787615776,
+      "sampling/sampling_logp_difference/max": 3.531106472015381,
+      "sampling/sampling_logp_difference/mean": 0.016581017524003983,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1358927824621787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1358927824621787e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 8128.21875,
+      "completions/mean_terminated_length": 7861.90283203125,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.8218234181404114,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002286596456542611,
+      "learning_rate": 1e-05,
+      "loss": 0.0763,
+      "num_tokens": 269726181.0,
+      "reward": 0.375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999798536300659,
+      "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06,
+      "sampling/sampling_logp_difference/max": 12.90043830871582,
+      "sampling/sampling_logp_difference/mean": 0.019403984770178795,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 1.4808477317274082e-05,
+      "clip_ratio/high_mean": 3.7021193293185206e-06,
+      "clip_ratio/low_mean": 3.0363167581981543e-05,
+      "clip_ratio/low_min": 6.364238288369961e-06,
+      "clip_ratio/region_mean": 3.4065286854456645e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 5673.3359375,
+      "completions/mean_terminated_length": 5503.32568359375,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.9275510385632515,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00485506234690547,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 270470616.0,
+      "reward": 0.4921875,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.0009123464697040617,
+      "sampling/sampling_logp_difference/max": 6.999490737915039,
+      "sampling/sampling_logp_difference/mean": 0.01881871558725834,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 1.1274602456978755e-05,
+      "clip_ratio/high_mean": 3.6739949109687586e-06,
+      "clip_ratio/low_mean": 3.968570712231667e-05,
+      "clip_ratio/low_min": 3.4213767321489286e-06,
+      "clip_ratio/region_mean": 4.335970191959859e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 6944.8984375,
+      "completions/mean_terminated_length": 6795.07177734375,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9335741624236107,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005874342750757933,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 271377723.0,
+      "reward": 0.390625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000594854354858,
+      "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05,
+      "sampling/sampling_logp_difference/max": 10.049861907958984,
+      "sampling/sampling_logp_difference/mean": 0.020590776577591896,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 1.264126694877632e-05,
+      "clip_ratio/high_mean": 3.16031673719408e-06,
+      "clip_ratio/low_mean": 3.206376845810155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.522408474054828e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 7705.625,
+      "completions/mean_terminated_length": 7278.8193359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.8491624072194099,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001684082904830575,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 272384891.0,
+      "reward": 0.390625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 6.605865200981498e-05,
+      "sampling/sampling_logp_difference/max": 9.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.020136822015047073,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 9.772357770998497e-06,
+      "clip_ratio/high_mean": 2.443089442749624e-06,
+      "clip_ratio/low_mean": 3.8573590472879005e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.101667946088128e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6611.1484375,
+      "completions/mean_terminated_length": 6534.19677734375,
+      "completions/min_length": 1116.0,
+      "completions/min_terminated_length": 1116.0,
+      "entropy": 0.8867302760481834,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003692191792652011,
+      "learning_rate": 1e-05,
+      "loss": 0.1233,
+      "num_tokens": 273251630.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999606609344482,
+      "sampling/importance_sampling_ratio/min": 0.0031062732450664043,
+      "sampling/sampling_logp_difference/max": 5.774331569671631,
+      "sampling/sampling_logp_difference/mean": 0.019237037748098373,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 3.0103737344688852e-05,
+      "clip_ratio/high_mean": 9.664363972206047e-06,
+      "clip_ratio/low_mean": 1.7575501146893657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.723986426644842e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15786.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 6770.46875,
+      "completions/mean_terminated_length": 6770.46875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.8252957463264465,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004167635925114155,
+      "learning_rate": 1e-05,
+      "loss": -0.0072,
+      "num_tokens": 274146482.0,
+      "reward": 0.5703125,
+      "reward_std": 0.23486016690731049,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.00010247006866848096,
+      "sampling/sampling_logp_difference/max": 9.18593978881836,
+      "sampling/sampling_logp_difference/mean": 0.019684650003910065,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 6.529460733872838e-06,
+      "clip_ratio/high_mean": 1.6323651834682096e-06,
+      "clip_ratio/low_mean": 3.877351048231503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.040587566578324e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15827.0,
+      "completions/mean_length": 8210.859375,
+      "completions/mean_terminated_length": 7365.36181640625,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.8118235394358635,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030363225378096104,
+      "learning_rate": 1e-05,
+      "loss": 0.0531,
+      "num_tokens": 275214040.0,
+      "reward": 0.3515625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998943209648132,
+      "sampling/importance_sampling_ratio/min": 0.002854935359209776,
+      "sampling/sampling_logp_difference/max": 5.858705997467041,
+      "sampling/sampling_logp_difference/mean": 0.019275270402431488,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.0800629146106075e-06,
+      "clip_ratio/high_mean": 1.7700157286526519e-06,
+      "clip_ratio/low_mean": 2.3981688286767167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5751703674359305e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14900.0,
+      "completions/mean_length": 7072.8828125,
+      "completions/mean_terminated_length": 6849.41650390625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8018335327506065,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004777858033776283,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 276138049.0,
+      "reward": 0.453125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368190765381,
+      "sampling/importance_sampling_ratio/min": 0.0028502768836915493,
+      "sampling/sampling_logp_difference/max": 5.860339164733887,
+      "sampling/sampling_logp_difference/mean": 0.01849908009171486,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 2.259368602608447e-05,
+      "clip_ratio/high_mean": 5.648421506521117e-06,
+      "clip_ratio/low_mean": 4.28424866640853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.849090737479855e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14447.0,
+      "completions/mean_length": 5889.8359375,
+      "completions/mean_terminated_length": 5723.26220703125,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.7976400703191757,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030593445990234613,
+      "learning_rate": 1e-05,
+      "loss": 0.1331,
+      "num_tokens": 276910124.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999091029167175,
+      "sampling/importance_sampling_ratio/min": 0.000139843366923742,
+      "sampling/sampling_logp_difference/max": 8.874987602233887,
+      "sampling/sampling_logp_difference/mean": 0.01834402233362198,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 1.4654247024736833e-05,
+      "clip_ratio/high_mean": 3.663561756184208e-06,
+      "clip_ratio/low_mean": 2.377464920755301e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7438210736363544e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 7144.265625,
+      "completions/mean_terminated_length": 6689.85205078125,
+      "completions/min_length": 1200.0,
+      "completions/min_terminated_length": 1200.0,
+      "entropy": 0.8309404999017715,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004245694726705551,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 277843542.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998534321784973,
+      "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05,
+      "sampling/sampling_logp_difference/max": 11.499897956848145,
+      "sampling/sampling_logp_difference/mean": 0.01875344291329384,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 6.252500952541595e-06,
+      "clip_ratio/high_mean": 2.241558604509919e-06,
+      "clip_ratio/low_mean": 4.735765514851664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9599213525652885e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15722.0,
+      "completions/mean_length": 6779.5234375,
+      "completions/mean_terminated_length": 6703.8974609375,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.9584890529513359,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035574575886130333,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 278730129.0,
+      "reward": 0.3984375,
+      "reward_std": 0.32825323939323425,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.005792221520096064,
+      "sampling/sampling_logp_difference/max": 5.151239395141602,
+      "sampling/sampling_logp_difference/mean": 0.02137477695941925,
+      "step": 320
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 278730129,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-320/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-320/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Metadata extracted from a single ``*_model_states.pt`` file.
+
+    Instances are created by ``parse_model_states``. The annotations are plain
+    types (``dict``), not ``dict()`` instances, so introspection tools see real types.
+    """
+    # buffer name -> tensor, already converted with .float() by parse_model_states
+    buffers: dict
+    # value stored under PARAM_SHAPES in the checkpoint (iterated as a sequence of name -> shape mappings)
+    param_shapes: dict
+    # list of [key, value] pairs taken from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # value stored under DS_VERSION, or None when absent
+    # NOTE(review): annotated as int but taken verbatim from the checkpoint - confirm the real type
+    ds_version: int
+    # value stored under FROZEN_PARAM_SHAPES, or None when the checkpoint has no frozen params
+    frozen_param_shapes: dict
+    # value stored under FROZEN_PARAM_FRAGMENTS, or None when absent
+    frozen_param_fragments: dict
+
+
+# set to a truthy value to make the parsing helpers print what they find in the checkpoint
+debug = 0
+
+# load to cpu: this device is passed as map_location to the torch.load calls below
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text`` is made only of digits, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+
+
+def natural_keys(text):
+    '''
+    Build a sort key that compares digit runs numerically instead of character by character.
+
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    # the capturing group makes re.split keep the digit runs, so atoi can turn them into ints
+    chunks = re.split(r'(\d+)', text)
+    return list(map(atoi, chunks))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """Return the path of the model-states file that belongs to ``zero_stage``.
+
+    Args:
+        checkpoint_dir: directory that holds the checkpoint files.
+        zero_stage: ZeRO stage of the checkpoint; stages up to 2 use
+            ``mp_rank_00_model_states.pt``, stage 3 uses
+            ``zero_pp_rank_0_mp_rank_00_model_states.pt``.
+
+    Returns:
+        The path of the existing model-states file.
+
+    Raises:
+        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the expected file is missing.
+        ValueError: if ``zero_stage`` is greater than 3 (previously this surfaced as an
+            ``UnboundLocalError`` because no file name was ever chosen).
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """Return the files in ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises:
+        FileNotFoundError: if no file matches the pattern.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    pattern = os.path.join(checkpoint_dir, glob_pattern)
+    matches = glob.glob(pattern)
+    matches.sort(key=natural_keys)
+
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return matches
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+
+
+def parse_model_states(files):
+    """Load every model-states file and keep only the metadata needed to rebuild the weights.
+
+    Args:
+        files: paths of ``*_model_states.pt`` checkpoint files.
+
+    Returns:
+        A list with one ``zero_model_state`` per entry of ``files``, in the same order.
+
+    Raises:
+        ValueError: if a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint.
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: weights_only=False unpickles arbitrary objects - only load checkpoints you trust
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional, so both lookups may yield None
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        zero_model_states.append(
+            zero_model_state(buffers=buffers,
+                             param_shapes=param_shapes,
+                             shared_params=shared_params,
+                             ds_version=ds_version,
+                             frozen_param_shapes=frozen_param_shapes,
+                             frozen_param_fragments=frozen_param_fragments))
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """Load the optimizer-state shards and extract the fp32 weight groups from each of them.
+
+    Args:
+        files: paths of the ``*_optim_states.pt`` shards.
+        ds_checkpoint_dir: directory the shards were found in; only used in the error message.
+
+    Returns:
+        A tuple ``(zero_stage, world_size, fp32_flat_groups)``. ``zero_stage`` and ``world_size``
+        are read from the first shard; ``fp32_flat_groups`` has one entry per shard, in the
+        order of ``files``.
+
+    Raises:
+        ValueError: if the first shard is not a zero checkpoint, if the number of shards differs
+            from the recorded partition count, or if the zero stage is unknown.
+    """
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        # NOTE: weights_only=False unpickles arbitrary objects - only load checkpoints you trust
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    # stage and partition count are taken from the first shard only
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if isinstance(world_size, list):
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: passed through to the stage-specific reconstruction function
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:  # every stage up to 2 goes through the zero2 reconstruction
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):  # adds frozen params to state_dict in place
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]  # the first model state's fragment is stored as-is
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    # True when `obj` has an attribute named `fn` and that attribute can be called.
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol: for every param group, concatenate the partitions held by all ranks into
+    # one flat fp32 vector, then slice each param out of that vector in `param_shapes` order and view it
+    # with its full shape.
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()  # rebinds avail_numel to this group's size
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check: once both are rounded up to a multiple of 2*world_size they must be equal
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Build the ZeRO-1/2 state dict: first model state's buffers, optional frozen params, trainable params, aliases."""
+    first_state = zero_model_states[0]
+    state_dict = OrderedDict()
+    state_dict.update(first_state.buffers)
+    if debug:
+        print(f"added {len(first_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in first_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """Return ``(numel per rank, total padding numel)`` for a param split evenly across ``world_size`` ranks."""
+    leftover = unpartitioned_numel % world_size
+    per_rank_numel = math.ceil(unpartitioned_numel / world_size)
+    return per_rank_numel, (world_size - leftover if leftover else 0)
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):  # adds frozen params to state_dict in place
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)  # keep the first numel elements
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)  # debug print only
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights; nothing is merged until ``contiguous()`` is called.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups  # indexed as flat_groups[rank][group_id] -> flat tensor
+        self.flat_groups_offset = flat_groups_offset  # group boundaries; group g spans [offset[g], offset[g + 1])
+        self.offset = offset  # where this param's partition starts within a rank's groups
+        self.partitioned_numel = partitioned_numel  # number of elements taken from each rank
+        self.shape = shape  # full shape of the merged param; must provide .numel()
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor of ``self.shape``.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups, clamping the slice to each group's own bounds
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = max(self.offset, self.flat_groups_offset[group_id]) - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks, then drop whatever lies beyond the param's real numel
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register a lazy ``GatheredTensor`` in ``state_dict`` for every trainable param of a ZeRO-3 checkpoint.
+    Reconstruction protocol: the partitions are zipped together at the boundary of each param (done later by
+    ``GatheredTensor.contiguous()``), dealing with padding if any. Raises ``ValueError`` on a numel mismatch.
+    """
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in zero_model_states[0].param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={[g.shape for g in fp32_flat_groups[i]]}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        # (each rank holds a list of flat groups, so avail_numel computed above from all of them is reused)
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor: only records where the param lives, no data is copied here
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check: the per-rank partitions consumed, times world_size, must cover every stored element
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Build the ZeRO-3 state dict: first model state's buffers, optional frozen params, trainable params, aliases."""
+    first_state = zero_model_states[0]
+    state_dict = OrderedDict()
+    state_dict.update(first_state.buffers)
+    if debug:
+        print(f"added {len(first_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in first_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Materialize every entry of ``state_dict`` through ``.contiguous()``, or as an uninitialized tensor
+    of the same shape and dtype when ``return_empty_tensor`` is set. Entries that are the same object
+    in the input map to one shared tensor in the output.
+    """
+    materialized = {}
+    first_name_by_id = {}  # id(entry) -> the first name that entry was seen under
+    for name, lazy in state_dict.items():
+        first_name = first_name_by_id.setdefault(id(lazy), name)
+        if first_name != name:  # shared tensors
+            materialized[name] = materialized[first_name]
+            continue
+        if return_empty_tensor:
+            materialized[name] = torch.empty(lazy.shape, dtype=lazy.dtype)
+        else:
+            materialized[name] = lazy.contiguous()
+    return materialized
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) under ``output_dir`` (one file per
+    shard, plus an index JSON when sharded) that can be loaded and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB; ``None`` writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check: fail before the (slow) conversion if an optional package is missing
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty tensors of the right shape/dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)  # only this shard's tensors are materialized
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)  # strict=False: mismatched keys are not raised as errors
+
+    return model
+
+
+if __name__ == "__main__":  # command-line entry point: python zero_to_fp32.py <checkpoint_dir> <output_dir>
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug  # rebinds the module-level flag that the merge functions read
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/README.md b/dapo_milora_plus_20251201_131939/checkpoint-384/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-384/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/adapter_config.json
@@ -0,0 +1,40 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-384/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/latest b/dapo_milora_plus_20251201_131939/checkpoint-384/latest
new file mode 100644
index 0000000000000000000000000000000000000000..47a30b050fc0cf5b9cd367ab63c36191546d4ff7
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/latest
@@ -0,0 +1 @@
+global_step384
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-384/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-384/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..779f29fd3b8eb44e5067bf4a00b20b8c4015fbb7
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json
@@ -0,0 +1,11938 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3532658693652254,
+  "eval_steps": 500,
+  "global_step": 384,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 9.941184316630824e-06,
+      "clip_ratio/high_mean": 2.485296079157706e-06,
+      "clip_ratio/low_mean": 2.6134909091979353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8620205910101504e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 8426.1015625,
+      "completions/mean_terminated_length": 7965.72705078125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8188603445887566,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030983765609562397,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 162199765.0,
+      "reward": 0.25,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411106109619,
+      "sampling/importance_sampling_ratio/min": 0.0009119694004766643,
+      "sampling/sampling_logp_difference/max": 6.999904155731201,
+      "sampling/sampling_logp_difference/mean": 0.02070600539445877,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 2.612139087432297e-05,
+      "clip_ratio/high_mean": 6.530347718580742e-06,
+      "clip_ratio/low_mean": 3.7853451885894174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.438379949078808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15904.0,
+      "completions/mean_length": 7154.2109375,
+      "completions/mean_terminated_length": 6856.4755859375,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "entropy": 0.9913735538721085,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003430198412388563,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 163133232.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2120065689086914,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000275373458862,
+      "sampling/importance_sampling_ratio/min": 0.00042929715709760785,
+      "sampling/sampling_logp_difference/max": 7.753361225128174,
+      "sampling/sampling_logp_difference/mean": 0.02190260961651802,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 3.1841454983805306e-06,
+      "clip_ratio/high_mean": 7.960363745951327e-07,
+      "clip_ratio/low_mean": 3.384581600585079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4641852380445926e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 7693.1328125,
+      "completions/mean_terminated_length": 7412.7822265625,
+      "completions/min_length": 1077.0,
+      "completions/min_terminated_length": 1077.0,
+      "entropy": 0.9887127950787544,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002780586015433073,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 164134393.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999028444290161,
+      "sampling/importance_sampling_ratio/min": 3.559096626304381e-07,
+      "sampling/sampling_logp_difference/max": 14.848588943481445,
+      "sampling/sampling_logp_difference/mean": 0.021110571920871735,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 9.770586984814145e-06,
+      "clip_ratio/high_mean": 5.008155312680174e-06,
+      "clip_ratio/low_mean": 5.182203130971175e-05,
+      "clip_ratio/low_min": 1.5574546068819473e-05,
+      "clip_ratio/region_mean": 5.683018616764457e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 7072.1484375,
+      "completions/mean_terminated_length": 6771.76611328125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.861792616546154,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030156150460243225,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 165063412.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998926520347595,
+      "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06,
+      "sampling/sampling_logp_difference/max": 12.999247550964355,
+      "sampling/sampling_logp_difference/mean": 0.019325289875268936,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 2.2510209873871645e-05,
+      "clip_ratio/high_mean": 6.455301331698138e-06,
+      "clip_ratio/low_mean": 6.156819108582567e-05,
+      "clip_ratio/low_min": 5.763157332694391e-06,
+      "clip_ratio/region_mean": 6.802349253121065e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15062.0,
+      "completions/mean_length": 7353.421875,
+      "completions/mean_terminated_length": 7062.11279296875,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "entropy": 0.8961873054504395,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034921523183584213,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 166024306.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.0005124400486238301,
+      "sampling/sampling_logp_difference/max": 7.576326847076416,
+      "sampling/sampling_logp_difference/mean": 0.019593238830566406,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.3040991007073899e-05,
+      "clip_ratio/high_mean": 4.292725350296678e-06,
+      "clip_ratio/low_mean": 5.347559840629401e-05,
+      "clip_ratio/low_min": 6.613406640099129e-06,
+      "clip_ratio/region_mean": 5.776832381343411e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15604.0,
+      "completions/mean_length": 7348.03125,
+      "completions/mean_terminated_length": 6903.63916015625,
+      "completions/min_length": 1619.0,
+      "completions/min_terminated_length": 1619.0,
+      "entropy": 0.824029266834259,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027784397825598717,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 166984982.0,
+      "reward": 0.40625,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 0.0010020677000284195,
+      "sampling/sampling_logp_difference/max": 6.905689716339111,
+      "sampling/sampling_logp_difference/mean": 0.01857386901974678,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 3.330808067403268e-05,
+      "clip_ratio/high_mean": 1.0969530649163062e-05,
+      "clip_ratio/low_mean": 3.2080681648949394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3050211388617754e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16358.0,
+      "completions/mean_length": 7290.4765625,
+      "completions/mean_terminated_length": 6920.82080078125,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8884479627013206,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004110465291887522,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 167936971.0,
+      "reward": 0.4375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06,
+      "sampling/sampling_logp_difference/max": 13.219663619995117,
+      "sampling/sampling_logp_difference/mean": 0.019696572795510292,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 9.77357763076725e-06,
+      "clip_ratio/high_mean": 2.4433944076918124e-06,
+      "clip_ratio/low_mean": 3.466498992565903e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.710838473125477e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 7803.625,
+      "completions/mean_terminated_length": 6833.66943359375,
+      "completions/min_length": 929.0,
+      "completions/min_terminated_length": 929.0,
+      "entropy": 0.8326860442757607,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002410614863038063,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "num_tokens": 168955683.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0008801451185718179,
+      "sampling/sampling_logp_difference/max": 7.035423755645752,
+      "sampling/sampling_logp_difference/mean": 0.018545793369412422,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.4602125929741305e-05,
+      "clip_ratio/high_mean": 3.6505314824353263e-06,
+      "clip_ratio/low_mean": 3.4781527119776e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8432058772741584e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6804.34375,
+      "completions/mean_terminated_length": 6495.322265625,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.9669496119022369,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034376555122435093,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 169845823.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31534504890441895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 1.767780588579626e-08,
+      "sampling/sampling_logp_difference/max": 17.850955963134766,
+      "sampling/sampling_logp_difference/mean": 0.020515555515885353,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 1.5814722473805887e-05,
+      "clip_ratio/high_mean": 3.953680618451472e-06,
+      "clip_ratio/low_mean": 3.574208744794305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9695768407455034e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 6827.9609375,
+      "completions/mean_terminated_length": 6105.23583984375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 0.8833946585655212,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026675171684473753,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 170738210.0,
+      "reward": 0.421875,
+      "reward_std": 0.2698654532432556,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000019907951355,
+      "sampling/importance_sampling_ratio/min": 0.002906275913119316,
+      "sampling/sampling_logp_difference/max": 5.840882778167725,
+      "sampling/sampling_logp_difference/mean": 0.019948139786720276,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.6623121837255894e-05,
+      "clip_ratio/high_mean": 4.1557804593139736e-06,
+      "clip_ratio/low_mean": 6.462372630267055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.877950727357529e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15725.0,
+      "completions/mean_length": 7377.984375,
+      "completions/mean_terminated_length": 7307.07080078125,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8881714344024658,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0039620306342840195,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 171705152.0,
+      "reward": 0.3359375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05,
+      "sampling/sampling_logp_difference/max": 10.614632606506348,
+      "sampling/sampling_logp_difference/mean": 0.01964445412158966,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 9.639111340220552e-06,
+      "clip_ratio/high_mean": 2.409777835055138e-06,
+      "clip_ratio/low_mean": 2.775239624952519e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0162174198267167e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15265.0,
+      "completions/mean_length": 6051.8828125,
+      "completions/mean_terminated_length": 5543.74560546875,
+      "completions/min_length": 819.0,
+      "completions/min_terminated_length": 819.0,
+      "entropy": 0.8851477280259132,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0040458571165800095,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 172501881.0,
+      "reward": 0.4296875,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999410510063171,
+      "sampling/importance_sampling_ratio/min": 0.0021976607386022806,
+      "sampling/sampling_logp_difference/max": 6.120361804962158,
+      "sampling/sampling_logp_difference/mean": 0.01957303285598755,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 9.72708312474424e-06,
+      "clip_ratio/high_mean": 3.529455852913088e-06,
+      "clip_ratio/low_mean": 5.158422732165491e-05,
+      "clip_ratio/low_min": 1.1939961495954776e-05,
+      "clip_ratio/region_mean": 5.5113683174567996e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7830.171875,
+      "completions/mean_terminated_length": 7409.4912109375,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.9070459827780724,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005941574461758137,
+      "learning_rate": 1e-05,
+      "loss": 0.0427,
+      "num_tokens": 173522391.0,
+      "reward": 0.34375,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000017881393433,
+      "sampling/importance_sampling_ratio/min": 0.00011712420382536948,
+      "sampling/sampling_logp_difference/max": 9.052275657653809,
+      "sampling/sampling_logp_difference/mean": 0.021295130252838135,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 5.5543214330100454e-06,
+      "clip_ratio/high_mean": 1.3885803582525114e-06,
+      "clip_ratio/low_mean": 1.718775109793569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8576331683561875e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15443.0,
+      "completions/mean_length": 7520.6796875,
+      "completions/mean_terminated_length": 6769.55078125,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.8843575045466423,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025851845275610685,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 174504534.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 0.00039556476986035705,
+      "sampling/sampling_logp_difference/max": 7.835196018218994,
+      "sampling/sampling_logp_difference/mean": 0.02016005665063858,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 1.0145481155632297e-05,
+      "clip_ratio/high_mean": 2.536370288908074e-06,
+      "clip_ratio/low_mean": 3.617897255026037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.871534295285528e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 7382.1875,
+      "completions/mean_terminated_length": 6861.42138671875,
+      "completions/min_length": 934.0,
+      "completions/min_terminated_length": 934.0,
+      "entropy": 0.916313610970974,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004170550964772701,
+      "learning_rate": 1e-05,
+      "loss": 0.047,
+      "num_tokens": 175472574.0,
+      "reward": 0.46875,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05,
+      "sampling/sampling_logp_difference/max": 10.481352806091309,
+      "sampling/sampling_logp_difference/mean": 0.020749717950820923,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.83663013963087e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.83663013963087e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 6122.453125,
+      "completions/mean_terminated_length": 6041.6533203125,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.8984386026859283,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 176275568.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 7.88934721640544e-06,
+      "sampling/sampling_logp_difference/max": 11.74999713897705,
+      "sampling/sampling_logp_difference/mean": 0.020278753712773323,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 1.4535152331518475e-05,
+      "clip_ratio/high_mean": 3.6337880828796187e-06,
+      "clip_ratio/low_mean": 4.3961883989140915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7595671958333696e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15547.0,
+      "completions/mean_length": 4983.2890625,
+      "completions/mean_terminated_length": 4709.67236328125,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.825260303914547,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004848882555961609,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 176932549.0,
+      "reward": 0.6484375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616146087646,
+      "sampling/importance_sampling_ratio/min": 1.626804078114219e-05,
+      "sampling/sampling_logp_difference/max": 11.026308059692383,
+      "sampling/sampling_logp_difference/mean": 0.017959970980882645,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.1141860795760294e-05,
+      "clip_ratio/high_mean": 2.7854651989400736e-06,
+      "clip_ratio/low_mean": 4.2418692146384274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5204157913758536e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 5766.5234375,
+      "completions/mean_terminated_length": 5511.7041015625,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.9016259610652924,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004749474115669727,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 177691752.0,
+      "reward": 0.5,
+      "reward_std": 0.2738044261932373,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000141859054565,
+      "sampling/importance_sampling_ratio/min": 8.927558155846782e-06,
+      "sampling/sampling_logp_difference/max": 11.626367568969727,
+      "sampling/sampling_logp_difference/mean": 0.019118282943964005,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 5.5243735914700665e-06,
+      "clip_ratio/high_mean": 2.1587275114143267e-06,
+      "clip_ratio/low_mean": 4.609663824339805e-05,
+      "clip_ratio/low_min": 3.983555870945565e-06,
+      "clip_ratio/region_mean": 4.8255366664307076e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15696.0,
+      "completions/mean_length": 6993.671875,
+      "completions/mean_terminated_length": 6768.30419921875,
+      "completions/min_length": 889.0,
+      "completions/min_terminated_length": 889.0,
+      "entropy": 0.9074988812208176,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004418120253831148,
+      "learning_rate": 1e-05,
+      "loss": 0.1135,
+      "num_tokens": 178603454.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000037670135498,
+      "sampling/importance_sampling_ratio/min": 0.0018135923892259598,
+      "sampling/sampling_logp_difference/max": 6.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01957814022898674,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 5.126943051436683e-06,
+      "clip_ratio/high_mean": 1.2817357628591708e-06,
+      "clip_ratio/low_mean": 2.7488794444252562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.877053032079857e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 7445.1328125,
+      "completions/mean_terminated_length": 6849.20849609375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9255013465881348,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00237120408564806,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 179577063.0,
+      "reward": 0.40625,
+      "reward_std": 0.21040897071361542,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725818634033,
+      "sampling/importance_sampling_ratio/min": 9.651589061832055e-05,
+      "sampling/sampling_logp_difference/max": 9.245802879333496,
+      "sampling/sampling_logp_difference/mean": 0.02165937051177025,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.8956294752570102e-05,
+      "clip_ratio/high_mean": 4.7390736881425255e-06,
+      "clip_ratio/low_mean": 2.6486316301088664e-05,
+      "clip_ratio/low_min": 3.516273409331916e-06,
+      "clip_ratio/region_mean": 3.122539010291803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6120.5546875,
+      "completions/mean_terminated_length": 5703.34130859375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8181199952960014,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004715202376246452,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 180380422.0,
+      "reward": 0.5,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999874472618103,
+      "sampling/importance_sampling_ratio/min": 0.004350374918431044,
+      "sampling/sampling_logp_difference/max": 5.437493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018377620726823807,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 5.594843969447538e-06,
+      "clip_ratio/high_mean": 2.376495558564784e-06,
+      "clip_ratio/low_mean": 3.4097628713425365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6474124044616474e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 6351.203125,
+      "completions/mean_terminated_length": 5857.78662109375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.8798654451966286,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003063712501898408,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 181212776.0,
+      "reward": 0.453125,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999946355819702,
+      "sampling/importance_sampling_ratio/min": 7.891544555604924e-06,
+      "sampling/sampling_logp_difference/max": 11.74971866607666,
+      "sampling/sampling_logp_difference/mean": 0.019523698836565018,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.544438988001275e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.544438988001275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14180.0,
+      "completions/mean_length": 6330.046875,
+      "completions/mean_terminated_length": 6170.46044921875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.8319354206323624,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033194730058312416,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 182041910.0,
+      "reward": 0.453125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998994469642639,
+      "sampling/importance_sampling_ratio/min": 0.00010535263572819531,
+      "sampling/sampling_logp_difference/max": 9.158197402954102,
+      "sampling/sampling_logp_difference/mean": 0.018981872126460075,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7156292415165808e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7156292415165808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15982.0,
+      "completions/mean_length": 6665.2890625,
+      "completions/mean_terminated_length": 6351.7822265625,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9336326420307159,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004492956213653088,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 182914843.0,
+      "reward": 0.3828125,
+      "reward_std": 0.14807432889938354,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 0.011399568989872932,
+      "sampling/sampling_logp_difference/max": 4.474179744720459,
+      "sampling/sampling_logp_difference/mean": 0.02088768407702446,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.2495465802639956e-05,
+      "clip_ratio/high_mean": 9.084843100026774e-06,
+      "clip_ratio/low_mean": 5.4809036328151706e-05,
+      "clip_ratio/low_min": 8.953898031904828e-06,
+      "clip_ratio/region_mean": 6.389387954186532e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16064.0,
+      "completions/mean_length": 5393.9140625,
+      "completions/mean_terminated_length": 5039.39501953125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.7864786610007286,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003816079581156373,
+      "learning_rate": 1e-05,
+      "loss": -0.004,
+      "num_tokens": 183628152.0,
+      "reward": 0.546875,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998779892921448,
+      "sampling/importance_sampling_ratio/min": 0.003246711567044258,
+      "sampling/sampling_logp_difference/max": 5.730112552642822,
+      "sampling/sampling_logp_difference/mean": 0.018448319286108017,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 8.638648068881594e-06,
+      "clip_ratio/high_mean": 2.1596620172203984e-06,
+      "clip_ratio/low_mean": 1.6896704778446292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9056366909353528e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 7161.5,
+      "completions/mean_terminated_length": 7015.111328125,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.915394201874733,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003666195785626769,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 184562352.0,
+      "reward": 0.3671875,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00025550799909979105,
+      "sampling/sampling_logp_difference/max": 8.272256851196289,
+      "sampling/sampling_logp_difference/mean": 0.019755780696868896,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 6.424931598303374e-06,
+      "clip_ratio/high_mean": 1.6062328995758435e-06,
+      "clip_ratio/low_mean": 2.49038239417132e-05,
+      "clip_ratio/low_min": 4.00025601265952e-06,
+      "clip_ratio/region_mean": 2.651005689813246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15408.0,
+      "completions/mean_length": 7957.671875,
+      "completions/mean_terminated_length": 7685.8544921875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "entropy": 1.1176252663135529,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025940234772861004,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 185606670.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999893844127655,
+      "sampling/importance_sampling_ratio/min": 0.0007622809498570859,
+      "sampling/sampling_logp_difference/max": 7.179195404052734,
+      "sampling/sampling_logp_difference/mean": 0.02338646724820137,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 1.9903963220713194e-05,
+      "clip_ratio/high_mean": 5.829163114867697e-06,
+      "clip_ratio/low_mean": 4.4742550926457625e-05,
+      "clip_ratio/low_min": 3.5803282116830815e-06,
+      "clip_ratio/region_mean": 5.057171370026481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 7060.6640625,
+      "completions/mean_terminated_length": 6759.9111328125,
+      "completions/min_length": 1460.0,
+      "completions/min_terminated_length": 1460.0,
+      "entropy": 0.9148540124297142,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004315398633480072,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 186526883.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0004585353017318994,
+      "sampling/sampling_logp_difference/max": 7.687473297119141,
+      "sampling/sampling_logp_difference/mean": 0.01967843994498253,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 1.147099328591139e-05,
+      "clip_ratio/high_mean": 2.8677483214778476e-06,
+      "clip_ratio/low_mean": 2.8967988555450574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1835736763241584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15596.0,
+      "completions/mean_length": 6649.6640625,
+      "completions/mean_terminated_length": 6416.04052734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9298559054732323,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030786178540438414,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 187397536.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000005841255188,
+      "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07,
+      "sampling/sampling_logp_difference/max": 14.929608345031738,
+      "sampling/sampling_logp_difference/mean": 0.020215414464473724,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 2.2768570943298982e-05,
+      "clip_ratio/high_mean": 5.692142735824746e-06,
+      "clip_ratio/low_mean": 3.249637484259438e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8188517464732286e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 8292.015625,
+      "completions/mean_terminated_length": 7823.8837890625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.8232023045420647,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002438523108139634,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 188477778.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.005636279005557299,
+      "sampling/sampling_logp_difference/max": 5.178531169891357,
+      "sampling/sampling_logp_difference/mean": 0.018984414637088776,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 2.0840709566982696e-05,
+      "clip_ratio/high_mean": 6.135253556749376e-06,
+      "clip_ratio/low_mean": 2.255633432923787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.869158777230041e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15991.0,
+      "completions/mean_length": 7600.9765625,
+      "completions/mean_terminated_length": 6936.71484375,
+      "completions/min_length": 995.0,
+      "completions/min_terminated_length": 995.0,
+      "entropy": 0.8689917623996735,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004773247055709362,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 189470655.0,
+      "reward": 0.40625,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.001327168894931674,
+      "sampling/sampling_logp_difference/max": 6.624707221984863,
+      "sampling/sampling_logp_difference/mean": 0.018666012212634087,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 9.837458947004052e-06,
+      "clip_ratio/high_mean": 2.459364736751013e-06,
+      "clip_ratio/low_mean": 6.463955219260242e-05,
+      "clip_ratio/low_min": 1.0895145351241808e-05,
+      "clip_ratio/region_mean": 6.70989177251613e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16215.0,
+      "completions/mean_length": 7600.34375,
+      "completions/mean_terminated_length": 6855.96630859375,
+      "completions/min_length": 1335.0,
+      "completions/min_terminated_length": 1335.0,
+      "entropy": 0.7636929750442505,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004298723768442869,
+      "learning_rate": 1e-05,
+      "loss": 0.145,
+      "num_tokens": 190462227.0,
+      "reward": 0.515625,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999310374259949,
+      "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05,
+      "sampling/sampling_logp_difference/max": 9.996363639831543,
+      "sampling/sampling_logp_difference/mean": 0.018035393208265305,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 1.4060602325116633e-05,
+      "clip_ratio/high_mean": 3.5151505812791584e-06,
+      "clip_ratio/low_mean": 2.6516039497437305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.003119024924672e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15151.0,
+      "completions/mean_length": 6512.0,
+      "completions/mean_terminated_length": 6434.267578125,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.9043584689497948,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006741553544998169,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 191312483.0,
+      "reward": 0.484375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 1.778468504198827e-05,
+      "sampling/sampling_logp_difference/max": 10.937172889709473,
+      "sampling/sampling_logp_difference/mean": 0.020878732204437256,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 1.7356085209030425e-05,
+      "clip_ratio/high_mean": 4.339021302257606e-06,
+      "clip_ratio/low_mean": 2.8831826739406097e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.317084781429003e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7178.6875,
+      "completions/mean_terminated_length": 6565.00048828125,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.8899475410580635,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00281486171297729,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 192251235.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2240736484527588,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999714493751526,
+      "sampling/importance_sampling_ratio/min": 9.012543159769848e-05,
+      "sampling/sampling_logp_difference/max": 9.314308166503906,
+      "sampling/sampling_logp_difference/mean": 0.020196784287691116,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.5558084214717383e-05,
+      "clip_ratio/high_mean": 3.889521053679346e-06,
+      "clip_ratio/low_mean": 3.0248688972278615e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.413820991227112e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15501.0,
+      "completions/max_terminated_length": 15501.0,
+      "completions/mean_length": 6602.5625,
+      "completions/mean_terminated_length": 6602.5625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.9266818463802338,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005070593673735857,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193116763.0,
+      "reward": 0.53125,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999746680259705,
+      "sampling/importance_sampling_ratio/min": 2.726537559283315e-06,
+      "sampling/sampling_logp_difference/max": 12.812478065490723,
+      "sampling/sampling_logp_difference/mean": 0.020026464015245438,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.188727416476468e-06,
+      "clip_ratio/high_mean": 1.047181854119117e-06,
+      "clip_ratio/low_mean": 2.959152834591805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.063871008635033e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6818.8828125,
+      "completions/mean_terminated_length": 6430.056640625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.874519519507885,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006362155079841614,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 194007868.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009298324585,
+      "sampling/importance_sampling_ratio/min": 0.0005216691642999649,
+      "sampling/sampling_logp_difference/max": 7.55847692489624,
+      "sampling/sampling_logp_difference/mean": 0.01943325623869896,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 9.645911177358357e-06,
+      "clip_ratio/high_mean": 2.4114777943395893e-06,
+      "clip_ratio/low_mean": 6.821557258263056e-05,
+      "clip_ratio/low_min": 1.7265090718865395e-05,
+      "clip_ratio/region_mean": 7.062705049065698e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14536.0,
+      "completions/mean_length": 5515.625,
+      "completions/mean_terminated_length": 5343.111328125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0683523043990135,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003797185141593218,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "num_tokens": 194735980.0,
+      "reward": 0.421875,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 1.137102216830499e-07,
+      "sampling/sampling_logp_difference/max": 15.989612579345703,
+      "sampling/sampling_logp_difference/mean": 0.02120930328965187,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.1971412252241862e-05,
+      "clip_ratio/high_mean": 5.4928530630604655e-06,
+      "clip_ratio/low_mean": 4.9151800567415194e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4644653801005916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 5853.546875,
+      "completions/mean_terminated_length": 5770.6298828125,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.7975900694727898,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004124365746974945,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 195504882.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000672340393066,
+      "sampling/importance_sampling_ratio/min": 0.0032877910416573286,
+      "sampling/sampling_logp_difference/max": 5.717539310455322,
+      "sampling/sampling_logp_difference/mean": 0.017819223925471306,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 7.066538728395244e-06,
+      "clip_ratio/high_mean": 2.843255515472265e-06,
+      "clip_ratio/low_mean": 5.1467116236381116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.431037175185338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6686.25,
+      "completions/mean_terminated_length": 6532.31787109375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.9018580466508865,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024995009880512953,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 196379306.0,
+      "reward": 0.421875,
+      "reward_std": 0.35824593901634216,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999300837516785,
+      "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05,
+      "sampling/sampling_logp_difference/max": 10.818918228149414,
+      "sampling/sampling_logp_difference/mean": 0.018989525735378265,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 6.652828687947476e-06,
+      "clip_ratio/high_mean": 2.5722979444253724e-06,
+      "clip_ratio/low_mean": 3.699686294567073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95691608900961e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16347.0,
+      "completions/mean_length": 7487.3359375,
+      "completions/mean_terminated_length": 7200.3466796875,
+      "completions/min_length": 1222.0,
+      "completions/min_terminated_length": 1222.0,
+      "entropy": 0.9890001565217972,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004295211285352707,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 197357397.0,
+      "reward": 0.40625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0006548459641635418,
+      "sampling/sampling_logp_difference/max": 7.33111047744751,
+      "sampling/sampling_logp_difference/mean": 0.02209121733903885,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 6.0850939007650595e-06,
+      "clip_ratio/high_mean": 1.5212734751912649e-06,
+      "clip_ratio/low_mean": 2.9443070673096372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0964344205131056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7233.484375,
+      "completions/mean_terminated_length": 6938.30615234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.9683803990483284,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003119673579931259,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 198303795.0,
+      "reward": 0.328125,
+      "reward_std": 0.23014704883098602,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000243186950684,
+      "sampling/importance_sampling_ratio/min": 0.020358745008707047,
+      "sampling/sampling_logp_difference/max": 3.89424467086792,
+      "sampling/sampling_logp_difference/mean": 0.021085180342197418,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.963812095113099e-06,
+      "clip_ratio/high_mean": 1.9909530237782747e-06,
+      "clip_ratio/low_mean": 4.031422963635123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.23051826601295e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15733.0,
+      "completions/mean_length": 6457.78125,
+      "completions/mean_terminated_length": 6300.22265625,
+      "completions/min_length": 850.0,
+      "completions/min_terminated_length": 850.0,
+      "entropy": 0.8881053999066353,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033790848683565855,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 199154735.0,
+      "reward": 0.3828125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998799562454224,
+      "sampling/importance_sampling_ratio/min": 2.872048128210736e-07,
+      "sampling/sampling_logp_difference/max": 15.063070297241211,
+      "sampling/sampling_logp_difference/mean": 0.01950821653008461,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 9.059622016138746e-06,
+      "clip_ratio/high_mean": 3.3430123380639998e-06,
+      "clip_ratio/low_mean": 2.2856192117615137e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6199204512522556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 7904.40625,
+      "completions/mean_terminated_length": 7769.81005859375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9881557524204254,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021492803934961557,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 200185643.0,
+      "reward": 0.359375,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001094341278076,
+      "sampling/importance_sampling_ratio/min": 0.001458622980862856,
+      "sampling/sampling_logp_difference/max": 6.530262470245361,
+      "sampling/sampling_logp_difference/mean": 0.021201875060796738,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 6.9962839006620925e-06,
+      "clip_ratio/high_mean": 1.7490709751655231e-06,
+      "clip_ratio/low_mean": 3.018811844412994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193718976035598e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 7414.4921875,
+      "completions/mean_terminated_length": 7414.4921875,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "entropy": 0.9571134969592094,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037221095990389585,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 201153114.0,
+      "reward": 0.4375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999958872795105,
+      "sampling/importance_sampling_ratio/min": 0.0009130563121289015,
+      "sampling/sampling_logp_difference/max": 6.99871301651001,
+      "sampling/sampling_logp_difference/mean": 0.021356744691729546,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 1.1248092050664127e-05,
+      "clip_ratio/high_mean": 2.8120230126660317e-06,
+      "clip_ratio/low_mean": 5.4354991334548686e-05,
+      "clip_ratio/low_min": 6.868132004456129e-06,
+      "clip_ratio/region_mean": 5.716701480196207e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15835.0,
+      "completions/max_terminated_length": 15835.0,
+      "completions/mean_length": 5955.953125,
+      "completions/mean_terminated_length": 5955.953125,
+      "completions/min_length": 1394.0,
+      "completions/min_terminated_length": 1394.0,
+      "entropy": 0.730999618768692,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006285305600613356,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 201933044.0,
+      "reward": 0.59375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.007535050623118877,
+      "sampling/sampling_logp_difference/max": 4.888189792633057,
+      "sampling/sampling_logp_difference/mean": 0.016975615173578262,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 7.226686648209579e-06,
+      "clip_ratio/high_mean": 3.094216481258627e-06,
+      "clip_ratio/low_mean": 4.66828214484849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.977703792974353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6923.3515625,
+      "completions/mean_terminated_length": 6458.0732421875,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9938417226076126,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005667983554303646,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 202837281.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05,
+      "sampling/sampling_logp_difference/max": 10.402952194213867,
+      "sampling/sampling_logp_difference/mean": 0.022059854120016098,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 5.2318769121484365e-06,
+      "clip_ratio/high_mean": 1.3079692280371091e-06,
+      "clip_ratio/low_mean": 4.239228087499214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3700250216716086e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14726.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 5930.9296875,
+      "completions/mean_terminated_length": 5930.9296875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.8100385963916779,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004052883945405483,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 203614448.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999989926815033,
+      "sampling/importance_sampling_ratio/min": 0.00015170808183029294,
+      "sampling/sampling_logp_difference/max": 8.79355239868164,
+      "sampling/sampling_logp_difference/mean": 0.018519222736358643,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 4.905230980511988e-06,
+      "clip_ratio/high_mean": 1.226307745127997e-06,
+      "clip_ratio/low_mean": 5.500513248080097e-05,
+      "clip_ratio/low_min": 7.924934834591113e-06,
+      "clip_ratio/region_mean": 5.6231440112242126e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14996.0,
+      "completions/mean_length": 6911.1015625,
+      "completions/mean_terminated_length": 6108.3134765625,
+      "completions/min_length": 862.0,
+      "completions/min_terminated_length": 862.0,
+      "entropy": 0.9260227829217911,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004494607914239168,
+      "learning_rate": 1e-05,
+      "loss": 0.0269,
+      "num_tokens": 204518261.0,
+      "reward": 0.4140625,
+      "reward_std": 0.34033796191215515,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998886585235596,
+      "sampling/importance_sampling_ratio/min": 0.0015266009140759706,
+      "sampling/sampling_logp_difference/max": 6.484711647033691,
+      "sampling/sampling_logp_difference/mean": 0.020527629181742668,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 8.293764039990492e-06,
+      "clip_ratio/high_mean": 2.073441009997623e-06,
+      "clip_ratio/low_mean": 4.75325257411896e-05,
+      "clip_ratio/low_min": 3.599504680096288e-06,
+      "clip_ratio/region_mean": 4.960596663750039e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14637.0,
+      "completions/mean_length": 6972.921875,
+      "completions/mean_terminated_length": 6823.5400390625,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 1.0095533654093742,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029451537411659956,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 205433843.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05,
+      "sampling/sampling_logp_difference/max": 10.53177547454834,
+      "sampling/sampling_logp_difference/mean": 0.02013089321553707,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 4.163383164268453e-05,
+      "clip_ratio/high_mean": 1.382379150527413e-05,
+      "clip_ratio/low_mean": 3.86000854177837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2423876240936806e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 6706.6640625,
+      "completions/mean_terminated_length": 6313.2763671875,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 0.8647518903017044,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003371767932549119,
+      "learning_rate": 1e-05,
+      "loss": 0.073,
+      "num_tokens": 206310296.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 2.948181463580113e-05,
+      "sampling/sampling_logp_difference/max": 10.431736946105957,
+      "sampling/sampling_logp_difference/mean": 0.019770190119743347,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4946740381892596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4946740381892596e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 6882.609375,
+      "completions/mean_terminated_length": 6415.32763671875,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "entropy": 1.013342760503292,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016336971893906593,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 207210974.0,
+      "reward": 0.359375,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999210834503174,
+      "sampling/importance_sampling_ratio/min": 0.0013267879839986563,
+      "sampling/sampling_logp_difference/max": 6.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.02139991894364357,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.4866403944324702e-05,
+      "clip_ratio/high_mean": 3.7166009860811755e-06,
+      "clip_ratio/low_mean": 3.938925010515959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.310585177336179e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15203.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 6195.7421875,
+      "completions/mean_terminated_length": 6195.7421875,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.8448907434940338,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005036406684666872,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208021893.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955892562866,
+      "sampling/importance_sampling_ratio/min": 0.0040348549373447895,
+      "sampling/sampling_logp_difference/max": 5.512784957885742,
+      "sampling/sampling_logp_difference/mean": 0.018679853528738022,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.1244883353356272e-05,
+      "clip_ratio/high_mean": 2.811220838339068e-06,
+      "clip_ratio/low_mean": 3.422392001084518e-05,
+      "clip_ratio/low_min": 6.451612989621935e-06,
+      "clip_ratio/region_mean": 3.703514119024476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6829.609375,
+      "completions/mean_terminated_length": 6521.40283203125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.8679579794406891,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029643685556948185,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 208912059.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00038063788088038564,
+      "sampling/sampling_logp_difference/max": 7.873661994934082,
+      "sampling/sampling_logp_difference/mean": 0.018488366156816483,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 2.2700600311509334e-05,
+      "clip_ratio/high_mean": 5.675150077877333e-06,
+      "clip_ratio/low_mean": 3.138338854569156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.705853873725573e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14503.0,
+      "completions/max_terminated_length": 14503.0,
+      "completions/mean_length": 5444.4453125,
+      "completions/mean_terminated_length": 5444.4453125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0460086688399315,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035942886024713516,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "num_tokens": 209627804.0,
+      "reward": 0.484375,
+      "reward_std": 0.338498055934906,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997478723526,
+      "sampling/importance_sampling_ratio/min": 0.03179635480046272,
+      "sampling/sampling_logp_difference/max": 3.4484035968780518,
+      "sampling/sampling_logp_difference/mean": 0.020146891474723816,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.477029400120955e-05,
+      "clip_ratio/high_mean": 4.552578502625693e-06,
+      "clip_ratio/low_mean": 5.265122354103369e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.720380158891203e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16244.0,
+      "completions/mean_length": 7657.390625,
+      "completions/mean_terminated_length": 7152.544921875,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 0.9528728649020195,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0044983453117311,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 210630150.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000007152557373,
+      "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05,
+      "sampling/sampling_logp_difference/max": 10.158285140991211,
+      "sampling/sampling_logp_difference/mean": 0.02131088823080063,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 8.607642712377128e-06,
+      "clip_ratio/high_mean": 2.151910678094282e-06,
+      "clip_ratio/low_mean": 2.2759413695894182e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.491132454451872e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7574.3515625,
+      "completions/mean_terminated_length": 7504.984375,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 1.0009776800870895,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006095650140196085,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 211620355.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 0.0013946897815912962,
+      "sampling/sampling_logp_difference/max": 6.575083255767822,
+      "sampling/sampling_logp_difference/mean": 0.021727774292230606,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.764823082339717e-05,
+      "clip_ratio/high_mean": 5.141430960975413e-06,
+      "clip_ratio/low_mean": 5.936152001595474e-05,
+      "clip_ratio/low_min": 9.155588486464694e-06,
+      "clip_ratio/region_mean": 6.450295177273802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14915.0,
+      "completions/mean_length": 7919.6875,
+      "completions/mean_terminated_length": 7716.54443359375,
+      "completions/min_length": 1517.0,
+      "completions/min_terminated_length": 1517.0,
+      "entropy": 1.0405654236674309,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037038614973425865,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 212654747.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381899833679,
+      "sampling/importance_sampling_ratio/min": 0.0057550109922885895,
+      "sampling/sampling_logp_difference/max": 5.157684326171875,
+      "sampling/sampling_logp_difference/mean": 0.022051017731428146,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.265254240934155e-05,
+      "clip_ratio/high_mean": 3.1631356023353874e-06,
+      "clip_ratio/low_mean": 4.716233138424286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.032546687289141e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 8613.4765625,
+      "completions/mean_terminated_length": 7735.0693359375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.890489287674427,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325607368722558,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 213774584.0,
+      "reward": 0.40625,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000060796737671,
+      "sampling/importance_sampling_ratio/min": 1.670176425250247e-05,
+      "sampling/sampling_logp_difference/max": 10.999996185302734,
+      "sampling/sampling_logp_difference/mean": 0.020002499222755432,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.6404605503339553e-05,
+      "clip_ratio/high_mean": 4.101151375834888e-06,
+      "clip_ratio/low_mean": 3.880500707964529e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2906158682853857e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7324.8984375,
+      "completions/mean_terminated_length": 6473.1884765625,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 0.761004202067852,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038265211042016745,
+      "learning_rate": 1e-05,
+      "loss": 0.0717,
+      "num_tokens": 214728371.0,
+      "reward": 0.515625,
+      "reward_std": 0.32719239592552185,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000168085098267,
+      "sampling/importance_sampling_ratio/min": 0.0003049026126973331,
+      "sampling/sampling_logp_difference/max": 8.095518112182617,
+      "sampling/sampling_logp_difference/mean": 0.018367979675531387,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 5.624549885396846e-06,
+      "clip_ratio/high_mean": 1.4061374713492114e-06,
+      "clip_ratio/low_mean": 3.6433707123251224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7839844594600436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14167.0,
+      "completions/max_terminated_length": 14167.0,
+      "completions/mean_length": 6422.0859375,
+      "completions/mean_terminated_length": 6422.0859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.9946094751358032,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002729539293795824,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 215570806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.026308411732316017,
+      "sampling/sampling_logp_difference/max": 3.637866497039795,
+      "sampling/sampling_logp_difference/mean": 0.021903935819864273,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 7.2379848461423535e-06,
+      "clip_ratio/high_mean": 1.8094962115355884e-06,
+      "clip_ratio/low_mean": 3.17277934982485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353728982347093e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15585.0,
+      "completions/mean_length": 6845.2890625,
+      "completions/mean_terminated_length": 6693.88134765625,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8822609707713127,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004974282346665859,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 216465635.0,
+      "reward": 0.5390625,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 8.749838889343664e-05,
+      "sampling/sampling_logp_difference/max": 9.343890190124512,
+      "sampling/sampling_logp_difference/mean": 0.019389234483242035,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.58592818024772e-05,
+      "clip_ratio/high_mean": 3.9648204506193e-06,
+      "clip_ratio/low_mean": 4.096964960353944e-05,
+      "clip_ratio/low_min": 1.7403560605089297e-05,
+      "clip_ratio/region_mean": 4.49344687467601e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 7805.484375,
+      "completions/mean_terminated_length": 7528.7578125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.9977599084377289,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0033159854356199503,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 217485089.0,
+      "reward": 0.421875,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999412298202515,
+      "sampling/importance_sampling_ratio/min": 7.967943383846432e-05,
+      "sampling/sampling_logp_difference/max": 9.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.021925684064626694,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.8265397557115648e-05,
+      "clip_ratio/high_mean": 4.566349389278912e-06,
+      "clip_ratio/low_mean": 4.044636898470344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5012717691861326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15681.0,
+      "completions/mean_length": 7737.5546875,
+      "completions/mean_terminated_length": 7530.04052734375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.8667014688253403,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034952745772898197,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "num_tokens": 218496040.0,
+      "reward": 0.453125,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999128580093384,
+      "sampling/importance_sampling_ratio/min": 6.726370338583365e-05,
+      "sampling/sampling_logp_difference/max": 9.606889724731445,
+      "sampling/sampling_logp_difference/mean": 0.019742710515856743,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 8.244294804171659e-06,
+      "clip_ratio/high_mean": 2.0610737010429148e-06,
+      "clip_ratio/low_mean": 3.204250072030845e-05,
+      "clip_ratio/low_min": 3.323495775475749e-06,
+      "clip_ratio/region_mean": 3.410357436450795e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15858.0,
+      "completions/mean_length": 7365.84375,
+      "completions/mean_terminated_length": 6601.59326171875,
+      "completions/min_length": 744.0,
+      "completions/min_terminated_length": 744.0,
+      "entropy": 0.8151945173740387,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038676802068948746,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 219459140.0,
+      "reward": 0.46875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00023387260443996638,
+      "sampling/sampling_logp_difference/max": 8.360733985900879,
+      "sampling/sampling_logp_difference/mean": 0.018882082775235176,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 6.87833608026267e-06,
+      "clip_ratio/high_mean": 2.9462287329806713e-06,
+      "clip_ratio/low_mean": 5.435333650893881e-05,
+      "clip_ratio/low_min": 5.33937054569833e-06,
+      "clip_ratio/region_mean": 5.729956546929316e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 6448.0078125,
+      "completions/mean_terminated_length": 6369.771484375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9546648040413857,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004310046322643757,
+      "learning_rate": 1e-05,
+      "loss": 0.1082,
+      "num_tokens": 220304605.0,
+      "reward": 0.5703125,
+      "reward_std": 0.35611939430236816,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999396800994873,
+      "sampling/importance_sampling_ratio/min": 0.0001234127557836473,
+      "sampling/sampling_logp_difference/max": 8.99997615814209,
+      "sampling/sampling_logp_difference/mean": 0.020253397524356842,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 6.196094091137638e-06,
+      "clip_ratio/high_mean": 1.5490235227844096e-06,
+      "clip_ratio/low_mean": 2.5416685957679874e-05,
+      "clip_ratio/low_min": 5.5736391004757024e-06,
+      "clip_ratio/region_mean": 2.696570959415112e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 7457.6484375,
+      "completions/mean_terminated_length": 6941.24755859375,
+      "completions/min_length": 604.0,
+      "completions/min_terminated_length": 604.0,
+      "entropy": 0.8182889074087143,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026646999176591635,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 221281968.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2012200653553009,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173283576965,
+      "sampling/importance_sampling_ratio/min": 2.902353571698768e-06,
+      "sampling/sampling_logp_difference/max": 12.749988555908203,
+      "sampling/sampling_logp_difference/mean": 0.019208962097764015,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 1.6189535017474554e-05,
+      "clip_ratio/high_mean": 4.047383754368639e-06,
+      "clip_ratio/low_mean": 3.127787306311802e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.532525670379982e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 8561.109375,
+      "completions/mean_terminated_length": 7969.79052734375,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.9581378549337387,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016026750672608614,
+      "learning_rate": 1e-05,
+      "loss": 0.0131,
+      "num_tokens": 222399046.0,
+      "reward": 0.34375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 1.653693971093162e-06,
+      "sampling/sampling_logp_difference/max": 13.312499046325684,
+      "sampling/sampling_logp_difference/mean": 0.02173236384987831,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 1.4200771602190798e-05,
+      "clip_ratio/high_mean": 4.3255887476334465e-06,
+      "clip_ratio/low_mean": 5.2955770115659107e-05,
+      "clip_ratio/low_min": 3.402656830076012e-06,
+      "clip_ratio/region_mean": 5.7281358749605715e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16239.0,
+      "completions/mean_length": 7152.34375,
+      "completions/mean_terminated_length": 7079.6533203125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9052041247487068,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005460259038954973,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 223335010.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3356297016143799,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999966621398926,
+      "sampling/importance_sampling_ratio/min": 0.010161337442696095,
+      "sampling/sampling_logp_difference/max": 4.589165210723877,
+      "sampling/sampling_logp_difference/mean": 0.01986619457602501,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 1.4350314813782461e-05,
+      "clip_ratio/high_mean": 3.5875787034456152e-06,
+      "clip_ratio/low_mean": 3.81288905373367e-05,
+      "clip_ratio/low_min": 8.099272235995159e-06,
+      "clip_ratio/region_mean": 4.1716469809216505e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 6678.65625,
+      "completions/mean_terminated_length": 6524.603515625,
+      "completions/min_length": 963.0,
+      "completions/min_terminated_length": 963.0,
+      "entropy": 0.9043187350034714,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005933742038905621,
+      "learning_rate": 1e-05,
+      "loss": 0.0966,
+      "num_tokens": 224207006.0,
+      "reward": 0.484375,
+      "reward_std": 0.3316681981086731,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000031590461731,
+      "sampling/importance_sampling_ratio/min": 0.0011734943836927414,
+      "sampling/sampling_logp_difference/max": 6.747769355773926,
+      "sampling/sampling_logp_difference/mean": 0.019827336072921753,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 1.6498819377375185e-05,
+      "clip_ratio/high_mean": 4.124704844343796e-06,
+      "clip_ratio/low_mean": 3.601791678420341e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.014262168539062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6999.0390625,
+      "completions/mean_terminated_length": 6850.07177734375,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8109970837831497,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003635740838944912,
+      "learning_rate": 1e-05,
+      "loss": 0.104,
+      "num_tokens": 225122891.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999303817749023,
+      "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05,
+      "sampling/sampling_logp_difference/max": 10.987512588500977,
+      "sampling/sampling_logp_difference/mean": 0.018912551924586296,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 9.527577958579059e-06,
+      "clip_ratio/high_mean": 2.3818944896447647e-06,
+      "clip_ratio/low_mean": 3.766565987461945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.004755419373396e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15713.0,
+      "completions/mean_length": 7483.7109375,
+      "completions/mean_terminated_length": 7045.9912109375,
+      "completions/min_length": 1153.0,
+      "completions/min_terminated_length": 1153.0,
+      "entropy": 0.9473970532417297,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003405241761356592,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 226102462.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00002920627594,
+      "sampling/importance_sampling_ratio/min": 0.00525119062513113,
+      "sampling/sampling_logp_difference/max": 5.249300479888916,
+      "sampling/sampling_logp_difference/mean": 0.021076779812574387,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.5867321963014547e-05,
+      "clip_ratio/high_mean": 3.966830490753637e-06,
+      "clip_ratio/low_mean": 3.8259706570897833e-05,
+      "clip_ratio/low_min": 3.549019083948224e-06,
+      "clip_ratio/region_mean": 4.2226537743772496e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 7569.03125,
+      "completions/mean_terminated_length": 7357.47216796875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9231455475091934,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025927501264959574,
+      "learning_rate": 1e-05,
+      "loss": 0.0801,
+      "num_tokens": 227093562.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19097033143043518,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0052477638237178326,
+      "sampling/sampling_logp_difference/max": 5.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.020578444004058838,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.344091060673236e-05,
+      "clip_ratio/high_mean": 3.36022765168309e-06,
+      "clip_ratio/low_mean": 4.253613235505327e-05,
+      "clip_ratio/low_min": 3.5579084851633525e-06,
+      "clip_ratio/region_mean": 4.5896360120423196e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 7589.2734375,
+      "completions/mean_terminated_length": 7378.2001953125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9265239909291267,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030512227676808834,
+      "learning_rate": 1e-05,
+      "loss": 0.04,
+      "num_tokens": 228086405.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0002165911573683843,
+      "sampling/sampling_logp_difference/max": 8.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.020208362489938736,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.9613525410022703e-05,
+      "clip_ratio/high_mean": 4.903381352505676e-06,
+      "clip_ratio/low_mean": 3.184792547017423e-05,
+      "clip_ratio/low_min": 7.29296516510658e-06,
+      "clip_ratio/region_mean": 3.675130722058384e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 8420.6875,
+      "completions/mean_terminated_length": 8096.97509765625,
+      "completions/min_length": 1114.0,
+      "completions/min_terminated_length": 1114.0,
+      "entropy": 0.9572964608669281,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022430522367358208,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 229183765.0,
+      "reward": 0.34375,
+      "reward_std": 0.309583842754364,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 0.00029693738906644285,
+      "sampling/sampling_logp_difference/max": 8.121989250183105,
+      "sampling/sampling_logp_difference/mean": 0.021570362150669098,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.728750577167375e-06,
+      "clip_ratio/high_mean": 1.6821876442918438e-06,
+      "clip_ratio/low_mean": 2.1682553096979973e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.336474062758498e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15736.0,
+      "completions/mean_length": 6809.765625,
+      "completions/mean_terminated_length": 6579.984375,
+      "completions/min_length": 860.0,
+      "completions/min_terminated_length": 860.0,
+      "entropy": 0.884086549282074,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004295065999031067,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 230077607.0,
+      "reward": 0.484375,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00754612497985363,
+      "sampling/sampling_logp_difference/max": 4.886721134185791,
+      "sampling/sampling_logp_difference/mean": 0.019895706325769424,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 2.8609347509700456e-05,
+      "clip_ratio/high_mean": 7.152336877425114e-06,
+      "clip_ratio/low_mean": 5.158006410965754e-05,
+      "clip_ratio/low_min": 5.210069957684027e-06,
+      "clip_ratio/region_mean": 5.873240070286556e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15080.0,
+      "completions/mean_length": 7340.6953125,
+      "completions/mean_terminated_length": 6973.0810546875,
+      "completions/min_length": 1616.0,
+      "completions/min_terminated_length": 1616.0,
+      "entropy": 0.9920620769262314,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004631794057786465,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 231035616.0,
+      "reward": 0.4375,
+      "reward_std": 0.3235401213169098,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337792396545,
+      "sampling/importance_sampling_ratio/min": 0.0002508950710762292,
+      "sampling/sampling_logp_difference/max": 8.290475845336914,
+      "sampling/sampling_logp_difference/mean": 0.020591016858816147,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.3085940774290066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3085940774290066e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14120.0,
+      "completions/mean_length": 6748.875,
+      "completions/mean_terminated_length": 6595.93701171875,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.9867061004042625,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035752104595303535,
+      "learning_rate": 1e-05,
+      "loss": 0.0455,
+      "num_tokens": 231920056.0,
+      "reward": 0.40625,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999653100967407,
+      "sampling/importance_sampling_ratio/min": 0.0003869794018100947,
+      "sampling/sampling_logp_difference/max": 7.8571391105651855,
+      "sampling/sampling_logp_difference/mean": 0.02061416581273079,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 1.2506750408647349e-05,
+      "clip_ratio/high_mean": 3.1266876021618373e-06,
+      "clip_ratio/low_mean": 3.10397430212106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.416643085074611e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 7260.3046875,
+      "completions/mean_terminated_length": 7188.46435546875,
+      "completions/min_length": 1384.0,
+      "completions/min_terminated_length": 1384.0,
+      "entropy": 1.0388494208455086,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036644963547587395,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 232869159.0,
+      "reward": 0.390625,
+      "reward_std": 0.2359209954738617,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999546408653259,
+      "sampling/importance_sampling_ratio/min": 0.0008660226594656706,
+      "sampling/sampling_logp_difference/max": 7.051599502563477,
+      "sampling/sampling_logp_difference/mean": 0.02120530977845192,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.704355301830219e-05,
+      "clip_ratio/high_mean": 6.760888254575548e-06,
+      "clip_ratio/low_mean": 3.1861192269388994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.862208097871189e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16073.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 6354.4609375,
+      "completions/mean_terminated_length": 6354.4609375,
+      "completions/min_length": 1035.0,
+      "completions/min_terminated_length": 1035.0,
+      "entropy": 0.8405331820249557,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004709267523139715,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 233702842.0,
+      "reward": 0.546875,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 0.0046309432946145535,
+      "sampling/sampling_logp_difference/max": 5.37499475479126,
+      "sampling/sampling_logp_difference/mean": 0.019126038998365402,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 9.749228638611385e-06,
+      "clip_ratio/high_mean": 2.437307159652846e-06,
+      "clip_ratio/low_mean": 3.855073941849696e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.098804652130639e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16026.0,
+      "completions/mean_length": 6514.578125,
+      "completions/mean_terminated_length": 6357.9208984375,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "entropy": 1.0254098922014236,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003066045930609107,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 234556348.0,
+      "reward": 0.4375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805092811584,
+      "sampling/importance_sampling_ratio/min": 0.005210204049944878,
+      "sampling/sampling_logp_difference/max": 5.257136344909668,
+      "sampling/sampling_logp_difference/mean": 0.019960148259997368,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.0475813724042382e-05,
+      "clip_ratio/high_mean": 2.6189534310105955e-06,
+      "clip_ratio/low_mean": 3.487835761006863e-05,
+      "clip_ratio/low_min": 2.9392399483185727e-06,
+      "clip_ratio/region_mean": 3.749731081370555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 7379.5546875,
+      "completions/mean_terminated_length": 7236.62744140625,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 1.0397320613265038,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005132520105689764,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 235521091.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519364118576,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999256134033203,
+      "sampling/importance_sampling_ratio/min": 0.00016659013635944575,
+      "sampling/sampling_logp_difference/max": 8.699974060058594,
+      "sampling/sampling_logp_difference/mean": 0.021417103707790375,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.9904123973901733e-05,
+      "clip_ratio/high_mean": 5.776861314643611e-06,
+      "clip_ratio/low_mean": 2.6659268655748747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2436129686175263e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14565.0,
+      "completions/mean_length": 7837.1640625,
+      "completions/mean_terminated_length": 7632.04052734375,
+      "completions/min_length": 1346.0,
+      "completions/min_terminated_length": 1346.0,
+      "entropy": 0.8400963917374611,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028969801496714354,
+      "learning_rate": 1e-05,
+      "loss": 0.0143,
+      "num_tokens": 236544160.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887943267822,
+      "sampling/importance_sampling_ratio/min": 2.883308241052873e-07,
+      "sampling/sampling_logp_difference/max": 15.059157371520996,
+      "sampling/sampling_logp_difference/mean": 0.019267702475190163,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 8.562770290154731e-06,
+      "clip_ratio/high_mean": 2.1406925725386827e-06,
+      "clip_ratio/low_mean": 4.060094340729847e-05,
+      "clip_ratio/low_min": 3.8700886761944275e-06,
+      "clip_ratio/region_mean": 4.2741635979837156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15350.0,
+      "completions/mean_length": 6696.3515625,
+      "completions/mean_terminated_length": 6542.57958984375,
+      "completions/min_length": 1239.0,
+      "completions/min_terminated_length": 1239.0,
+      "entropy": 0.8495818004012108,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003412836929783225,
+      "learning_rate": 1e-05,
+      "loss": 0.0803,
+      "num_tokens": 237423101.0,
+      "reward": 0.515625,
+      "reward_std": 0.37981897592544556,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.012152798473834991,
+      "sampling/sampling_logp_difference/max": 4.410195827484131,
+      "sampling/sampling_logp_difference/mean": 0.018458625301718712,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 1.1463653436294408e-05,
+      "clip_ratio/high_mean": 3.646129641765583e-06,
+      "clip_ratio/low_mean": 6.144847083078275e-05,
+      "clip_ratio/low_min": 1.110105540647055e-05,
+      "clip_ratio/region_mean": 6.509460160941671e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15666.0,
+      "completions/mean_length": 7700.3671875,
+      "completions/mean_terminated_length": 7121.45849609375,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "entropy": 0.8258870914578438,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024443145375698805,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 238429956.0,
+      "reward": 0.375,
+      "reward_std": 0.2872493863105774,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999113082885742,
+      "sampling/importance_sampling_ratio/min": 0.00026112530031241477,
+      "sampling/sampling_logp_difference/max": 8.250510215759277,
+      "sampling/sampling_logp_difference/mean": 0.019427984952926636,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 4.218127742205979e-06,
+      "clip_ratio/high_mean": 1.0545319355514948e-06,
+      "clip_ratio/low_mean": 1.7289162997258245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.834369493280974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16112.0,
+      "completions/mean_length": 6255.21875,
+      "completions/mean_terminated_length": 6094.44482421875,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.8179014846682549,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022747826296836138,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 239250160.0,
+      "reward": 0.5234375,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.0002633975527714938,
+      "sampling/sampling_logp_difference/max": 8.241846084594727,
+      "sampling/sampling_logp_difference/mean": 0.018723051995038986,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 1.698448841125355e-05,
+      "clip_ratio/high_mean": 5.369374321162468e-06,
+      "clip_ratio/low_mean": 6.14647315160255e-05,
+      "clip_ratio/low_min": 5.043576493335422e-06,
+      "clip_ratio/region_mean": 6.683410583718796e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15321.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6914.9609375,
+      "completions/mean_terminated_length": 6914.9609375,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9700981751084328,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005685295443981886,
+      "learning_rate": 1e-05,
+      "loss": -0.0056,
+      "num_tokens": 240156211.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998887777328491,
+      "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05,
+      "sampling/sampling_logp_difference/max": 9.997581481933594,
+      "sampling/sampling_logp_difference/mean": 0.021195171400904655,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9186837764427764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9186837764427764e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15469.0,
+      "completions/mean_length": 5227.53125,
+      "completions/mean_terminated_length": 5139.68505859375,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.9116031974554062,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003880272386595607,
+      "learning_rate": 1e-05,
+      "loss": 0.1246,
+      "num_tokens": 240845295.0,
+      "reward": 0.6328125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000362396240234,
+      "sampling/importance_sampling_ratio/min": 0.00012422871077433228,
+      "sampling/sampling_logp_difference/max": 8.993386268615723,
+      "sampling/sampling_logp_difference/mean": 0.018801718950271606,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 2.5015486926349695e-05,
+      "clip_ratio/high_mean": 8.084949570275057e-06,
+      "clip_ratio/low_mean": 5.524710468307603e-05,
+      "clip_ratio/low_min": 3.776891389861703e-06,
+      "clip_ratio/region_mean": 6.333205465125502e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 8065.4765625,
+      "completions/mean_terminated_length": 7510.90869140625,
+      "completions/min_length": 1055.0,
+      "completions/min_terminated_length": 1055.0,
+      "entropy": 0.7446574792265892,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028986844699829817,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 241895676.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3474721610546112,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999842643737793,
+      "sampling/importance_sampling_ratio/min": 0.0017039099475368857,
+      "sampling/sampling_logp_difference/max": 6.3748297691345215,
+      "sampling/sampling_logp_difference/mean": 0.01853121444582939,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 9.486341014053323e-06,
+      "clip_ratio/high_mean": 2.371585253513331e-06,
+      "clip_ratio/low_mean": 2.896106741445692e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.133265261112683e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15534.0,
+      "completions/max_terminated_length": 15534.0,
+      "completions/mean_length": 6127.359375,
+      "completions/mean_terminated_length": 6127.359375,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.8569132760167122,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003845847910270095,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 242698258.0,
+      "reward": 0.53125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000942945480347,
+      "sampling/importance_sampling_ratio/min": 0.00043231461313553154,
+      "sampling/sampling_logp_difference/max": 7.746356964111328,
+      "sampling/sampling_logp_difference/mean": 0.01856958493590355,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 2.9848330086679198e-05,
+      "clip_ratio/high_mean": 7.4620825216697995e-06,
+      "clip_ratio/low_mean": 4.3558867673709756e-05,
+      "clip_ratio/low_min": 4.417741820361698e-06,
+      "clip_ratio/region_mean": 5.1020949285884853e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15192.0,
+      "completions/mean_length": 6600.1484375,
+      "completions/mean_terminated_length": 6365.33642578125,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.78924310952425,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003953634761273861,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 243560957.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.0006525487406179309,
+      "sampling/sampling_logp_difference/max": 7.334624767303467,
+      "sampling/sampling_logp_difference/mean": 0.018097909167408943,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 6.635561703660642e-06,
+      "clip_ratio/high_mean": 1.6588904259151604e-06,
+      "clip_ratio/low_mean": 2.737523408313791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9034124281679397e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7852.171875,
+      "completions/mean_terminated_length": 7852.171875,
+      "completions/min_length": 1276.0,
+      "completions/min_terminated_length": 1276.0,
+      "entropy": 1.0598893761634827,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00360781978815794,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 244585923.0,
+      "reward": 0.3125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05,
+      "sampling/sampling_logp_difference/max": 10.076086044311523,
+      "sampling/sampling_logp_difference/mean": 0.022330068051815033,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 3.1540168947685743e-06,
+      "clip_ratio/high_mean": 7.885042236921436e-07,
+      "clip_ratio/low_mean": 4.7973388973332476e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.876189268543385e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7972.2265625,
+      "completions/mean_terminated_length": 7700.87890625,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.933217465877533,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0027661293279379606,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 245628064.0,
+      "reward": 0.28125,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05,
+      "sampling/sampling_logp_difference/max": 10.366576194763184,
+      "sampling/sampling_logp_difference/mean": 0.021125148981809616,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.2965969062861404e-05,
+      "clip_ratio/high_mean": 3.241492265715351e-06,
+      "clip_ratio/low_mean": 4.6317693090713874e-05,
+      "clip_ratio/low_min": 3.820877282123547e-06,
+      "clip_ratio/region_mean": 4.955918507221213e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15744.0,
+      "completions/mean_length": 7135.6953125,
+      "completions/mean_terminated_length": 6913.736328125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 0.7786942347884178,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005680318456143141,
+      "learning_rate": 1e-05,
+      "loss": 0.0786,
+      "num_tokens": 246561329.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999462366104126,
+      "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05,
+      "sampling/sampling_logp_difference/max": 9.737424850463867,
+      "sampling/sampling_logp_difference/mean": 0.018504241481423378,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.22437145175536e-05,
+      "clip_ratio/low_min": 1.4025082009538892e-05,
+      "clip_ratio/region_mean": 4.22437145175536e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 6704.046875,
+      "completions/mean_terminated_length": 6627.82666015625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "entropy": 1.0435140281915665,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026402862276881933,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 247437415.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31276631355285645,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998904466629028,
+      "sampling/importance_sampling_ratio/min": 0.0007800163584761322,
+      "sampling/sampling_logp_difference/max": 7.156195640563965,
+      "sampling/sampling_logp_difference/mean": 0.02134273201227188,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 2.223430897174694e-05,
+      "clip_ratio/high_mean": 6.8746438159905665e-06,
+      "clip_ratio/low_mean": 4.7084630978133646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3959275192028144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 5892.5078125,
+      "completions/mean_terminated_length": 5725.9765625,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "entropy": 0.8004944771528244,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003993614576756954,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 248211112.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0024652592837810516,
+      "sampling/sampling_logp_difference/max": 6.005458354949951,
+      "sampling/sampling_logp_difference/mean": 0.01924925297498703,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 2.1833082200828358e-05,
+      "clip_ratio/high_mean": 5.458270550207089e-06,
+      "clip_ratio/low_mean": 3.415995615796419e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961822596920683e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 7812.140625,
+      "completions/mean_terminated_length": 7316.24755859375,
+      "completions/min_length": 1515.0,
+      "completions/min_terminated_length": 1515.0,
+      "entropy": 0.8841542899608612,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001573400106281042,
+      "learning_rate": 1e-05,
+      "loss": 0.0823,
+      "num_tokens": 249228106.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 0.001001527882181108,
+      "sampling/sampling_logp_difference/max": 6.906228542327881,
+      "sampling/sampling_logp_difference/mean": 0.01956877112388611,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 1.014439021673752e-05,
+      "clip_ratio/high_mean": 2.53609755418438e-06,
+      "clip_ratio/low_mean": 3.068193461785995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.321803217204433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 6372.953125,
+      "completions/mean_terminated_length": 6132.6884765625,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.8228401988744736,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021125099156051874,
+      "learning_rate": 1e-05,
+      "loss": 0.0438,
+      "num_tokens": 250063284.0,
+      "reward": 0.5,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05,
+      "sampling/sampling_logp_difference/max": 9.937475204467773,
+      "sampling/sampling_logp_difference/mean": 0.01943521574139595,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 7.023906164249638e-06,
+      "clip_ratio/high_mean": 1.7559765410624095e-06,
+      "clip_ratio/low_mean": 2.526416994896863e-05,
+      "clip_ratio/low_min": 6.7760895490209805e-06,
+      "clip_ratio/region_mean": 2.7020146660561295e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16270.0,
+      "completions/mean_length": 7817.8671875,
+      "completions/mean_terminated_length": 7396.58154296875,
+      "completions/min_length": 1568.0,
+      "completions/min_terminated_length": 1568.0,
+      "entropy": 0.9454319775104523,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022315154783427715,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 251085123.0,
+      "reward": 0.40625,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06,
+      "sampling/sampling_logp_difference/max": 12.760490417480469,
+      "sampling/sampling_logp_difference/mean": 0.021764669567346573,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 1.4797966287005693e-05,
+      "clip_ratio/high_mean": 3.699491571751423e-06,
+      "clip_ratio/low_mean": 4.36271948274225e-05,
+      "clip_ratio/low_min": 3.6957101201551268e-06,
+      "clip_ratio/region_mean": 4.732668639917392e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 7168.4921875,
+      "completions/mean_terminated_length": 6635.36328125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8433891162276268,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 252020906.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589920043945,
+      "sampling/importance_sampling_ratio/min": 0.0003851866349577904,
+      "sampling/sampling_logp_difference/max": 7.861782550811768,
+      "sampling/sampling_logp_difference/mean": 0.01929781585931778,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 1.996871560550062e-05,
+      "clip_ratio/high_mean": 6.089093403716106e-06,
+      "clip_ratio/low_mean": 4.2792244585143635e-05,
+      "clip_ratio/low_min": 1.0337215371691855e-05,
+      "clip_ratio/region_mean": 4.8881338216233416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7322.5078125,
+      "completions/mean_terminated_length": 6876.8603515625,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 0.9157031401991844,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036942458245903254,
+      "learning_rate": 1e-05,
+      "loss": 0.079,
+      "num_tokens": 252977435.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24275577068328857,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999804496765137,
+      "sampling/importance_sampling_ratio/min": 0.00029605376766994596,
+      "sampling/sampling_logp_difference/max": 8.124969482421875,
+      "sampling/sampling_logp_difference/mean": 0.0205365102738142,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.631919460327481e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.631919460327481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16078.0,
+      "completions/mean_length": 7025.484375,
+      "completions/mean_terminated_length": 6723.5966796875,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 1.1329731941223145,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034127074759453535,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 253896161.0,
+      "reward": 0.25,
+      "reward_std": 0.27722424268722534,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0005197672289796174,
+      "sampling/sampling_logp_difference/max": 7.562129497528076,
+      "sampling/sampling_logp_difference/mean": 0.023741140961647034,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 4.368643658381188e-06,
+      "clip_ratio/high_mean": 1.092160914595297e-06,
+      "clip_ratio/low_mean": 2.4661783299961826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5753944555617636e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13776.0,
+      "completions/mean_length": 5996.1796875,
+      "completions/mean_terminated_length": 5661.08837890625,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8773328885436058,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003959407564252615,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 254690264.0,
+      "reward": 0.53125,
+      "reward_std": 0.26645541191101074,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07,
+      "sampling/sampling_logp_difference/max": 15.73043155670166,
+      "sampling/sampling_logp_difference/mean": 0.018407585099339485,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.616483677935321e-05,
+      "clip_ratio/high_mean": 4.041209194838302e-06,
+      "clip_ratio/low_mean": 3.736187466074625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140308453770558e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16383.0,
+      "completions/mean_length": 7165.328125,
+      "completions/mean_terminated_length": 6867.951171875,
+      "completions/min_length": 1115.0,
+      "completions/min_terminated_length": 1115.0,
+      "entropy": 0.9502597972750664,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030910037457942963,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 255626394.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000731945037842,
+      "sampling/importance_sampling_ratio/min": 0.00022311302018351853,
+      "sampling/sampling_logp_difference/max": 8.407832145690918,
+      "sampling/sampling_logp_difference/mean": 0.020668907091021538,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.1702686606440693e-05,
+      "clip_ratio/high_mean": 2.9256716516101733e-06,
+      "clip_ratio/low_mean": 5.5247357522603124e-05,
+      "clip_ratio/low_min": 3.6811261452385224e-06,
+      "clip_ratio/region_mean": 5.8173028264718596e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15375.0,
+      "completions/mean_length": 8001.9296875,
+      "completions/mean_terminated_length": 7661.34912109375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "entropy": 0.8591345250606537,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037233952898532152,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 256673457.0,
+      "reward": 0.421875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999151229858398,
+      "sampling/importance_sampling_ratio/min": 0.0021876997780054808,
+      "sampling/sampling_logp_difference/max": 6.124904632568359,
+      "sampling/sampling_logp_difference/mean": 0.020540472120046616,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 3.721341136042611e-05,
+      "clip_ratio/high_mean": 1.2759249216287571e-05,
+      "clip_ratio/low_mean": 3.570647322703735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.846572301175911e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 6924.84375,
+      "completions/mean_terminated_length": 6697.82421875,
+      "completions/min_length": 803.0,
+      "completions/min_terminated_length": 803.0,
+      "entropy": 0.7969356626272202,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006054217461496592,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 257578501.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.007889713160693645,
+      "sampling/sampling_logp_difference/max": 4.842195510864258,
+      "sampling/sampling_logp_difference/mean": 0.019306108355522156,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.0211543894911301e-05,
+      "clip_ratio/high_mean": 2.5528859737278253e-06,
+      "clip_ratio/low_mean": 5.2388056587915344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4940942732173426e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14439.0,
+      "completions/mean_length": 6203.03125,
+      "completions/mean_terminated_length": 5958.6884765625,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "entropy": 0.8734413683414459,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004903806839138269,
+      "learning_rate": 1e-05,
+      "loss": 0.0689,
+      "num_tokens": 258392625.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999826550483704,
+      "sampling/importance_sampling_ratio/min": 0.00020370795391499996,
+      "sampling/sampling_logp_difference/max": 8.498823165893555,
+      "sampling/sampling_logp_difference/mean": 0.01909301057457924,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.5135058674786706e-05,
+      "clip_ratio/high_mean": 4.64845766146027e-06,
+      "clip_ratio/low_mean": 4.373456977191381e-05,
+      "clip_ratio/low_min": 3.670856358439778e-06,
+      "clip_ratio/region_mean": 4.8383026296505705e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 7982.5390625,
+      "completions/mean_terminated_length": 7641.01611328125,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0091779381036758,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033637424930930138,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 259435270.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999765753746033,
+      "sampling/importance_sampling_ratio/min": 0.0016514655435457826,
+      "sampling/sampling_logp_difference/max": 6.406092166900635,
+      "sampling/sampling_logp_difference/mean": 0.02182736061513424,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 2.3964702677403693e-05,
+      "clip_ratio/high_mean": 5.991175669350923e-06,
+      "clip_ratio/low_mean": 5.2442986770984135e-05,
+      "clip_ratio/low_min": 8.75736759553547e-06,
+      "clip_ratio/region_mean": 5.843416238349164e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6915.3125,
+      "completions/mean_terminated_length": 6688.064453125,
+      "completions/min_length": 778.0,
+      "completions/min_terminated_length": 778.0,
+      "entropy": 0.7964543774724007,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052203768864274025,
+      "learning_rate": 1e-05,
+      "loss": 0.144,
+      "num_tokens": 260337614.0,
+      "reward": 0.46875,
+      "reward_std": 0.37928223609924316,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 7.032832218101248e-05,
+      "sampling/sampling_logp_difference/max": 9.562335968017578,
+      "sampling/sampling_logp_difference/mean": 0.017896221950650215,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 4.458271632756805e-05,
+      "clip_ratio/high_mean": 1.1145679081892013e-05,
+      "clip_ratio/low_mean": 6.243192206056847e-05,
+      "clip_ratio/low_min": 1.2397775662975619e-05,
+      "clip_ratio/region_mean": 7.357759886872373e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7029.4375,
+      "completions/mean_terminated_length": 6880.95263671875,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "entropy": 0.8605096861720085,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005570738110691309,
+      "learning_rate": 1e-05,
+      "loss": 0.0984,
+      "num_tokens": 261254070.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3327290117740631,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999494552612305,
+      "sampling/importance_sampling_ratio/min": 0.0009070249507203698,
+      "sampling/sampling_logp_difference/max": 7.005340576171875,
+      "sampling/sampling_logp_difference/mean": 0.01905740052461624,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 3.390461233720998e-05,
+      "clip_ratio/high_mean": 1.1191766247975465e-05,
+      "clip_ratio/low_mean": 7.46641262594494e-05,
+      "clip_ratio/low_min": 5.041745680500753e-06,
+      "clip_ratio/region_mean": 8.585589102949598e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5858.84375,
+      "completions/mean_terminated_length": 5606.240234375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.8430554121732712,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004496110137552023,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 262024906.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294877052307,
+      "sampling/importance_sampling_ratio/min": 0.00040469475788995624,
+      "sampling/sampling_logp_difference/max": 7.812377452850342,
+      "sampling/sampling_logp_difference/mean": 0.019225869327783585,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 3.2563955301156966e-06,
+      "clip_ratio/high_mean": 8.140988825289242e-07,
+      "clip_ratio/low_mean": 3.7080020149460324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.789411886145899e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15976.0,
+      "completions/mean_length": 8337.328125,
+      "completions/mean_terminated_length": 7728.7568359375,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.901745393872261,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00348713924176991,
+      "learning_rate": 1e-05,
+      "loss": -0.0002,
+      "num_tokens": 263110844.0,
+      "reward": 0.296875,
+      "reward_std": 0.20805485546588898,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998900890350342,
+      "sampling/importance_sampling_ratio/min": 0.0022652465850114822,
+      "sampling/sampling_logp_difference/max": 6.090071678161621,
+      "sampling/sampling_logp_difference/mean": 0.02157524600625038,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 2.3739744847262045e-05,
+      "clip_ratio/high_mean": 5.934936211815511e-06,
+      "clip_ratio/low_mean": 2.823553325015382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.417046866616147e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 7084.7265625,
+      "completions/mean_terminated_length": 6381.42041015625,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8265534415841103,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003980033565312624,
+      "learning_rate": 1e-05,
+      "loss": 0.0551,
+      "num_tokens": 264036169.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673366546631,
+      "sampling/importance_sampling_ratio/min": 0.00012345099821686745,
+      "sampling/sampling_logp_difference/max": 8.999666213989258,
+      "sampling/sampling_logp_difference/mean": 0.018782664090394974,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 1.1745505617000163e-05,
+      "clip_ratio/high_mean": 3.771558226617344e-06,
+      "clip_ratio/low_mean": 6.913120819262986e-05,
+      "clip_ratio/low_min": 2.494283216947224e-05,
+      "clip_ratio/region_mean": 7.290276607818669e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6543.796875,
+      "completions/mean_terminated_length": 6543.796875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8899869695305824,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.006467343773692846,
+      "learning_rate": 1e-05,
+      "loss": 0.1139,
+      "num_tokens": 264892767.0,
+      "reward": 0.484375,
+      "reward_std": 0.3934885561466217,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000489950180054,
+      "sampling/importance_sampling_ratio/min": 9.891482477542013e-05,
+      "sampling/sampling_logp_difference/max": 9.221251487731934,
+      "sampling/sampling_logp_difference/mean": 0.02032080665230751,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.395576979732141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.395576979732141e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16307.0,
+      "completions/mean_length": 8483.390625,
+      "completions/mean_terminated_length": 7813.84765625,
+      "completions/min_length": 1342.0,
+      "completions/min_terminated_length": 1342.0,
+      "entropy": 0.9621479511260986,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003174177836626768,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 265995697.0,
+      "reward": 0.3359375,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.0005628522485494614,
+      "sampling/sampling_logp_difference/max": 7.4824934005737305,
+      "sampling/sampling_logp_difference/mean": 0.02145479805767536,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.2596524811669951e-05,
+      "clip_ratio/high_mean": 3.149131202917488e-06,
+      "clip_ratio/low_mean": 3.7911659774181317e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.106079018129094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14985.0,
+      "completions/mean_length": 7184.578125,
+      "completions/mean_terminated_length": 6963.79248046875,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.9993807673454285,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003356153378263116,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 266937707.0,
+      "reward": 0.3828125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000238418579102,
+      "sampling/importance_sampling_ratio/min": 0.0017036627978086472,
+      "sampling/sampling_logp_difference/max": 6.374974727630615,
+      "sampling/sampling_logp_difference/mean": 0.02204768732190132,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 1.9245163684900035e-05,
+      "clip_ratio/high_mean": 4.811290921225009e-06,
+      "clip_ratio/low_mean": 4.8845648166206956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.365693925796222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16216.0,
+      "completions/mean_length": 7029.2265625,
+      "completions/mean_terminated_length": 6727.45947265625,
+      "completions/min_length": 851.0,
+      "completions/min_terminated_length": 851.0,
+      "entropy": 0.9139953926205635,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006375293247401714,
+      "learning_rate": 1e-05,
+      "loss": 0.0519,
+      "num_tokens": 267853880.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.010649868287146091,
+      "sampling/sampling_logp_difference/max": 4.542207717895508,
+      "sampling/sampling_logp_difference/mean": 0.020365029573440552,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 4.812504812434781e-06,
+      "clip_ratio/high_mean": 1.2031262031086953e-06,
+      "clip_ratio/low_mean": 2.5999243803198624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.720237000630732e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6188.0078125,
+      "completions/mean_terminated_length": 5943.30419921875,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.7640773430466652,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003697809297591448,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 268665721.0,
+      "reward": 0.5078125,
+      "reward_std": 0.20699402689933777,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372363090515,
+      "sampling/importance_sampling_ratio/min": 0.02927250787615776,
+      "sampling/sampling_logp_difference/max": 3.531106472015381,
+      "sampling/sampling_logp_difference/mean": 0.016581017524003983,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1358927824621787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1358927824621787e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 8128.21875,
+      "completions/mean_terminated_length": 7861.90283203125,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.8218234181404114,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002286596456542611,
+      "learning_rate": 1e-05,
+      "loss": 0.0763,
+      "num_tokens": 269726181.0,
+      "reward": 0.375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999798536300659,
+      "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06,
+      "sampling/sampling_logp_difference/max": 12.90043830871582,
+      "sampling/sampling_logp_difference/mean": 0.019403984770178795,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 1.4808477317274082e-05,
+      "clip_ratio/high_mean": 3.7021193293185206e-06,
+      "clip_ratio/low_mean": 3.0363167581981543e-05,
+      "clip_ratio/low_min": 6.364238288369961e-06,
+      "clip_ratio/region_mean": 3.4065286854456645e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 5673.3359375,
+      "completions/mean_terminated_length": 5503.32568359375,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.9275510385632515,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00485506234690547,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 270470616.0,
+      "reward": 0.4921875,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.0009123464697040617,
+      "sampling/sampling_logp_difference/max": 6.999490737915039,
+      "sampling/sampling_logp_difference/mean": 0.01881871558725834,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 1.1274602456978755e-05,
+      "clip_ratio/high_mean": 3.6739949109687586e-06,
+      "clip_ratio/low_mean": 3.968570712231667e-05,
+      "clip_ratio/low_min": 3.4213767321489286e-06,
+      "clip_ratio/region_mean": 4.335970191959859e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 6944.8984375,
+      "completions/mean_terminated_length": 6795.07177734375,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9335741624236107,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005874342750757933,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 271377723.0,
+      "reward": 0.390625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000594854354858,
+      "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05,
+      "sampling/sampling_logp_difference/max": 10.049861907958984,
+      "sampling/sampling_logp_difference/mean": 0.020590776577591896,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 1.264126694877632e-05,
+      "clip_ratio/high_mean": 3.16031673719408e-06,
+      "clip_ratio/low_mean": 3.206376845810155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.522408474054828e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 7705.625,
+      "completions/mean_terminated_length": 7278.8193359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.8491624072194099,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001684082904830575,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 272384891.0,
+      "reward": 0.390625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 6.605865200981498e-05,
+      "sampling/sampling_logp_difference/max": 9.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.020136822015047073,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 9.772357770998497e-06,
+      "clip_ratio/high_mean": 2.443089442749624e-06,
+      "clip_ratio/low_mean": 3.8573590472879005e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.101667946088128e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6611.1484375,
+      "completions/mean_terminated_length": 6534.19677734375,
+      "completions/min_length": 1116.0,
+      "completions/min_terminated_length": 1116.0,
+      "entropy": 0.8867302760481834,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003692191792652011,
+      "learning_rate": 1e-05,
+      "loss": 0.1233,
+      "num_tokens": 273251630.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999606609344482,
+      "sampling/importance_sampling_ratio/min": 0.0031062732450664043,
+      "sampling/sampling_logp_difference/max": 5.774331569671631,
+      "sampling/sampling_logp_difference/mean": 0.019237037748098373,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 3.0103737344688852e-05,
+      "clip_ratio/high_mean": 9.664363972206047e-06,
+      "clip_ratio/low_mean": 1.7575501146893657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.723986426644842e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15786.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 6770.46875,
+      "completions/mean_terminated_length": 6770.46875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.8252957463264465,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004167635925114155,
+      "learning_rate": 1e-05,
+      "loss": -0.0072,
+      "num_tokens": 274146482.0,
+      "reward": 0.5703125,
+      "reward_std": 0.23486016690731049,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.00010247006866848096,
+      "sampling/sampling_logp_difference/max": 9.18593978881836,
+      "sampling/sampling_logp_difference/mean": 0.019684650003910065,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 6.529460733872838e-06,
+      "clip_ratio/high_mean": 1.6323651834682096e-06,
+      "clip_ratio/low_mean": 3.877351048231503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.040587566578324e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15827.0,
+      "completions/mean_length": 8210.859375,
+      "completions/mean_terminated_length": 7365.36181640625,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.8118235394358635,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030363225378096104,
+      "learning_rate": 1e-05,
+      "loss": 0.0531,
+      "num_tokens": 275214040.0,
+      "reward": 0.3515625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998943209648132,
+      "sampling/importance_sampling_ratio/min": 0.002854935359209776,
+      "sampling/sampling_logp_difference/max": 5.858705997467041,
+      "sampling/sampling_logp_difference/mean": 0.019275270402431488,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.0800629146106075e-06,
+      "clip_ratio/high_mean": 1.7700157286526519e-06,
+      "clip_ratio/low_mean": 2.3981688286767167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5751703674359305e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14900.0,
+      "completions/mean_length": 7072.8828125,
+      "completions/mean_terminated_length": 6849.41650390625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8018335327506065,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004777858033776283,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 276138049.0,
+      "reward": 0.453125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368190765381,
+      "sampling/importance_sampling_ratio/min": 0.0028502768836915493,
+      "sampling/sampling_logp_difference/max": 5.860339164733887,
+      "sampling/sampling_logp_difference/mean": 0.01849908009171486,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 2.259368602608447e-05,
+      "clip_ratio/high_mean": 5.648421506521117e-06,
+      "clip_ratio/low_mean": 4.28424866640853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.849090737479855e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14447.0,
+      "completions/mean_length": 5889.8359375,
+      "completions/mean_terminated_length": 5723.26220703125,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.7976400703191757,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030593445990234613,
+      "learning_rate": 1e-05,
+      "loss": 0.1331,
+      "num_tokens": 276910124.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999091029167175,
+      "sampling/importance_sampling_ratio/min": 0.000139843366923742,
+      "sampling/sampling_logp_difference/max": 8.874987602233887,
+      "sampling/sampling_logp_difference/mean": 0.01834402233362198,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 1.4654247024736833e-05,
+      "clip_ratio/high_mean": 3.663561756184208e-06,
+      "clip_ratio/low_mean": 2.377464920755301e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7438210736363544e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 7144.265625,
+      "completions/mean_terminated_length": 6689.85205078125,
+      "completions/min_length": 1200.0,
+      "completions/min_terminated_length": 1200.0,
+      "entropy": 0.8309404999017715,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004245694726705551,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 277843542.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998534321784973,
+      "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05,
+      "sampling/sampling_logp_difference/max": 11.499897956848145,
+      "sampling/sampling_logp_difference/mean": 0.01875344291329384,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 6.252500952541595e-06,
+      "clip_ratio/high_mean": 2.241558604509919e-06,
+      "clip_ratio/low_mean": 4.735765514851664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9599213525652885e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15722.0,
+      "completions/mean_length": 6779.5234375,
+      "completions/mean_terminated_length": 6703.8974609375,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.9584890529513359,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035574575886130333,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 278730129.0,
+      "reward": 0.3984375,
+      "reward_std": 0.32825323939323425,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.005792221520096064,
+      "sampling/sampling_logp_difference/max": 5.151239395141602,
+      "sampling/sampling_logp_difference/mean": 0.02137477695941925,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 3.2948471016425174e-05,
+      "clip_ratio/high_mean": 9.518853403278627e-06,
+      "clip_ratio/low_mean": 2.195712454522436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.14759782895635e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15892.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 5582.9765625,
+      "completions/mean_terminated_length": 5582.9765625,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8629376217722893,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037982752546668053,
+      "learning_rate": 1e-05,
+      "loss": 0.0331,
+      "num_tokens": 279462542.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3164186477661133,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999780058860779,
+      "sampling/importance_sampling_ratio/min": 0.0021874974481761456,
+      "sampling/sampling_logp_difference/max": 6.124997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01906203106045723,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.1029473625967512e-05,
+      "clip_ratio/high_mean": 2.757368406491878e-06,
+      "clip_ratio/low_mean": 5.367386921761863e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6431237737797346e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 6942.2578125,
+      "completions/mean_terminated_length": 6477.90966796875,
+      "completions/min_length": 1156.0,
+      "completions/min_terminated_length": 1156.0,
+      "entropy": 0.8147861957550049,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027678858023136854,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 280370207.0,
+      "reward": 0.4375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998471736907959,
+      "sampling/importance_sampling_ratio/min": 0.00023058800434228033,
+      "sampling/sampling_logp_difference/max": 8.3748779296875,
+      "sampling/sampling_logp_difference/mean": 0.01940828748047352,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 2.6367894406575942e-05,
+      "clip_ratio/high_mean": 8.765707434577052e-06,
+      "clip_ratio/low_mean": 3.232976985145797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.109547796815605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6242.53125,
+      "completions/mean_terminated_length": 5915.38671875,
+      "completions/min_length": 1220.0,
+      "completions/min_terminated_length": 1220.0,
+      "entropy": 0.878915011882782,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00577945914119482,
+      "learning_rate": 1e-05,
+      "loss": 0.0839,
+      "num_tokens": 281189491.0,
+      "reward": 0.515625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 9.611724817659706e-05,
+      "sampling/sampling_logp_difference/max": 9.2499418258667,
+      "sampling/sampling_logp_difference/mean": 0.01948760263621807,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 3.50839609382092e-05,
+      "clip_ratio/high_mean": 1.1664920634757436e-05,
+      "clip_ratio/low_mean": 1.833109013205103e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9996010880495305e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 7004.015625,
+      "completions/mean_terminated_length": 6622.71533203125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "entropy": 0.7964659407734871,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014128695474937558,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 282103997.0,
+      "reward": 0.4140625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.0024504722096025944,
+      "sampling/sampling_logp_difference/max": 6.011474609375,
+      "sampling/sampling_logp_difference/mean": 0.019019678235054016,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.832260545597819e-05,
+      "clip_ratio/high_mean": 4.580651363994548e-06,
+      "clip_ratio/low_mean": 5.309064226821647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.767129368905444e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7822.6953125,
+      "completions/mean_terminated_length": 7546.52392578125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.8571138679981232,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002476039342582226,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 283122382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999314546585083,
+      "sampling/importance_sampling_ratio/min": 0.0009774373611435294,
+      "sampling/sampling_logp_difference/max": 6.930576324462891,
+      "sampling/sampling_logp_difference/mean": 0.020557202398777008,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 5.738419986300869e-06,
+      "clip_ratio/high_mean": 1.4346049965752172e-06,
+      "clip_ratio/low_mean": 4.19679121819172e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3402517292179255e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7738.8984375,
+      "completions/mean_terminated_length": 6844.57763671875,
+      "completions/min_length": 897.0,
+      "completions/min_terminated_length": 897.0,
+      "entropy": 0.7839021533727646,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005309853237122297,
+      "learning_rate": 1e-05,
+      "loss": 0.043,
+      "num_tokens": 284130081.0,
+      "reward": 0.5234375,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998971223831177,
+      "sampling/importance_sampling_ratio/min": 0.0001319014554610476,
+      "sampling/sampling_logp_difference/max": 8.933455467224121,
+      "sampling/sampling_logp_difference/mean": 0.01873316988348961,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 1.007085802484653e-05,
+      "clip_ratio/high_mean": 2.5177145062116324e-06,
+      "clip_ratio/low_mean": 4.043528815600439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.295300277590286e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15952.0,
+      "completions/mean_length": 7102.2421875,
+      "completions/mean_terminated_length": 6954.9130859375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.8530801385641098,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004228116944432259,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 285058720.0,
+      "reward": 0.5078125,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00012956927821505815,
+      "sampling/sampling_logp_difference/max": 8.951294898986816,
+      "sampling/sampling_logp_difference/mean": 0.019325006753206253,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 4.06874551117653e-06,
+      "clip_ratio/high_mean": 1.0171863777941326e-06,
+      "clip_ratio/low_mean": 3.661125703047219e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.762844340826632e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15594.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6583.4765625,
+      "completions/mean_terminated_length": 6583.4765625,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 1.021921381354332,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004967439454048872,
+      "learning_rate": 1e-05,
+      "loss": 0.0374,
+      "num_tokens": 285919765.0,
+      "reward": 0.328125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00004243850708,
+      "sampling/importance_sampling_ratio/min": 0.016675354912877083,
+      "sampling/sampling_logp_difference/max": 4.093823432922363,
+      "sampling/sampling_logp_difference/mean": 0.021393200382590294,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.2215251445013564e-05,
+      "clip_ratio/high_mean": 3.053812861253391e-06,
+      "clip_ratio/low_mean": 4.05305947879242e-05,
+      "clip_ratio/low_min": 4.215567059873138e-06,
+      "clip_ratio/region_mean": 4.358440742180392e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16299.0,
+      "completions/mean_length": 7770.5859375,
+      "completions/mean_terminated_length": 7346.97509765625,
+      "completions/min_length": 1040.0,
+      "completions/min_terminated_length": 1040.0,
+      "entropy": 1.0466903448104858,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004189736675471067,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 286935512.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2369818240404129,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797344207764,
+      "sampling/importance_sampling_ratio/min": 0.011683559976518154,
+      "sampling/sampling_logp_difference/max": 4.449572563171387,
+      "sampling/sampling_logp_difference/mean": 0.021805983036756516,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 2.0567378214764176e-05,
+      "clip_ratio/high_mean": 5.141844553691044e-06,
+      "clip_ratio/low_mean": 1.8177100628236076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3318944840866607e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15758.0,
+      "completions/mean_length": 5689.2421875,
+      "completions/mean_terminated_length": 5432.568359375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.7778806164860725,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0032866497058421373,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 287681943.0,
+      "reward": 0.640625,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940812587738,
+      "sampling/importance_sampling_ratio/min": 0.00038077132194302976,
+      "sampling/sampling_logp_difference/max": 7.873311519622803,
+      "sampling/sampling_logp_difference/mean": 0.01789461076259613,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 3.109086901531555e-05,
+      "clip_ratio/high_mean": 7.772717253828887e-06,
+      "clip_ratio/low_mean": 3.1423560130861006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.919627738468989e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13820.0,
+      "completions/mean_length": 6288.1875,
+      "completions/mean_terminated_length": 6127.93701171875,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.7709921672940254,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023572889622300863,
+      "learning_rate": 1e-05,
+      "loss": 0.0746,
+      "num_tokens": 288506735.0,
+      "reward": 0.484375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474287033081,
+      "sampling/importance_sampling_ratio/min": 0.000430915504693985,
+      "sampling/sampling_logp_difference/max": 7.749598503112793,
+      "sampling/sampling_logp_difference/mean": 0.017407266423106194,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 3.4638953366084024e-05,
+      "clip_ratio/high_mean": 9.51674803673086e-06,
+      "clip_ratio/low_mean": 6.26047980176736e-05,
+      "clip_ratio/low_min": 5.51267930859467e-06,
+      "clip_ratio/region_mean": 7.212154741864651e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 6775.0234375,
+      "completions/mean_terminated_length": 6465.05615234375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9338318258523941,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034220058005303144,
+      "learning_rate": 1e-05,
+      "loss": 0.0986,
+      "num_tokens": 289395498.0,
+      "reward": 0.390625,
+      "reward_std": 0.34533774852752686,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603033065796,
+      "sampling/importance_sampling_ratio/min": 0.0317598432302475,
+      "sampling/sampling_logp_difference/max": 3.449552536010742,
+      "sampling/sampling_logp_difference/mean": 0.019930530339479446,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 7.159989991123439e-05,
+      "clip_ratio/low_min": 1.5592839645250933e-05,
+      "clip_ratio/region_mean": 7.159989991123439e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 7142.9375,
+      "completions/mean_terminated_length": 6844.83837890625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 0.971405878663063,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002513247774913907,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 290329082.0,
+      "reward": 0.328125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999737739562988,
+      "sampling/importance_sampling_ratio/min": 3.152207455059397e-07,
+      "sampling/sampling_logp_difference/max": 14.969992637634277,
+      "sampling/sampling_logp_difference/mean": 0.022366533055901527,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 1.6507752206962323e-05,
+      "clip_ratio/high_mean": 4.126938051740581e-06,
+      "clip_ratio/low_mean": 1.7493430505055585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1620368215735652e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15581.0,
+      "completions/mean_length": 6412.2109375,
+      "completions/mean_terminated_length": 6333.69287109375,
+      "completions/min_length": 544.0,
+      "completions/min_terminated_length": 544.0,
+      "entropy": 0.9136044681072235,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0056767817586660385,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 291170133.0,
+      "reward": 0.421875,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999720454216003,
+      "sampling/importance_sampling_ratio/min": 0.000458698661532253,
+      "sampling/sampling_logp_difference/max": 7.687117099761963,
+      "sampling/sampling_logp_difference/mean": 0.020012658089399338,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 8.26085442895419e-06,
+      "clip_ratio/high_mean": 2.0652136072385474e-06,
+      "clip_ratio/low_mean": 3.6938338666914206e-05,
+      "clip_ratio/low_min": 5.699044777429663e-06,
+      "clip_ratio/region_mean": 3.900355193309224e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16111.0,
+      "completions/mean_length": 8066.1015625,
+      "completions/mean_terminated_length": 7797.7822265625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 1.0789504647254944,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00243841833434999,
+      "learning_rate": 1e-05,
+      "loss": 0.0432,
+      "num_tokens": 292222082.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999664425849915,
+      "sampling/importance_sampling_ratio/min": 8.481895929435268e-05,
+      "sampling/sampling_logp_difference/max": 9.374991416931152,
+      "sampling/sampling_logp_difference/mean": 0.023650091141462326,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 5.320054697222076e-06,
+      "clip_ratio/high_mean": 1.330013674305519e-06,
+      "clip_ratio/low_mean": 1.9117383317279746e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0447396991585265e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15176.0,
+      "completions/mean_length": 6836.046875,
+      "completions/mean_terminated_length": 6606.896484375,
+      "completions/min_length": 785.0,
+      "completions/min_terminated_length": 785.0,
+      "entropy": 1.218759760260582,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0020856577903032303,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 293115984.0,
+      "reward": 0.21875,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999911785125732,
+      "sampling/importance_sampling_ratio/min": 2.784526441246271e-05,
+      "sampling/sampling_logp_difference/max": 10.488847732543945,
+      "sampling/sampling_logp_difference/mean": 0.022012067958712578,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 2.5695502699818462e-05,
+      "clip_ratio/high_mean": 7.549717793153832e-06,
+      "clip_ratio/low_mean": 4.6741323160404136e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.429104089671455e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15796.0,
+      "completions/mean_length": 7501.9921875,
+      "completions/mean_terminated_length": 7140.9345703125,
+      "completions/min_length": 1237.0,
+      "completions/min_terminated_length": 1237.0,
+      "entropy": 0.8940394818782806,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005163854919373989,
+      "learning_rate": 1e-05,
+      "loss": 0.0354,
+      "num_tokens": 294099503.0,
+      "reward": 0.328125,
+      "reward_std": 0.30904707312583923,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999276399612427,
+      "sampling/importance_sampling_ratio/min": 0.0006545600481331348,
+      "sampling/sampling_logp_difference/max": 7.331547260284424,
+      "sampling/sampling_logp_difference/mean": 0.020813245326280594,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 3.1606674838258186e-05,
+      "clip_ratio/high_mean": 9.45794374729303e-06,
+      "clip_ratio/low_mean": 4.5567895540443715e-05,
+      "clip_ratio/low_min": 4.458871444512624e-06,
+      "clip_ratio/region_mean": 5.502583962879726e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7204.828125,
+      "completions/mean_terminated_length": 6908.7255859375,
+      "completions/min_length": 846.0,
+      "completions/min_terminated_length": 846.0,
+      "entropy": 0.9961872175335884,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029277894645929337,
+      "learning_rate": 1e-05,
+      "loss": 0.0963,
+      "num_tokens": 295042105.0,
+      "reward": 0.390625,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000677108764648,
+      "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05,
+      "sampling/sampling_logp_difference/max": 10.872637748718262,
+      "sampling/sampling_logp_difference/mean": 0.020187582820653915,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 1.7963964182854397e-05,
+      "clip_ratio/high_mean": 5.194059781388205e-06,
+      "clip_ratio/low_mean": 1.8380221035840805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.357428081722901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15856.0,
+      "completions/mean_length": 6256.859375,
+      "completions/mean_terminated_length": 6013.80810546875,
+      "completions/min_length": 1006.0,
+      "completions/min_terminated_length": 1006.0,
+      "entropy": 0.9293600022792816,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032952844630926847,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 295867039.0,
+      "reward": 0.46875,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999649524688721,
+      "sampling/importance_sampling_ratio/min": 7.995560008566827e-05,
+      "sampling/sampling_logp_difference/max": 9.434039115905762,
+      "sampling/sampling_logp_difference/mean": 0.019491540268063545,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 7.577551059512189e-06,
+      "clip_ratio/high_mean": 1.8943877648780472e-06,
+      "clip_ratio/low_mean": 2.7479814093567256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9374201631071628e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15412.0,
+      "completions/mean_length": 7397.84375,
+      "completions/mean_terminated_length": 7032.552734375,
+      "completions/min_length": 923.0,
+      "completions/min_terminated_length": 923.0,
+      "entropy": 0.8508890569210052,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029417150653898716,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 296832843.0,
+      "reward": 0.375,
+      "reward_std": 0.2867125868797302,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000183582305908,
+      "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05,
+      "sampling/sampling_logp_difference/max": 10.93724250793457,
+      "sampling/sampling_logp_difference/mean": 0.01975393109023571,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 3.281225508544594e-05,
+      "clip_ratio/high_mean": 1.3302957199812226e-05,
+      "clip_ratio/low_mean": 5.109179869577929e-05,
+      "clip_ratio/low_min": 6.657612175331451e-06,
+      "clip_ratio/region_mean": 6.439475532715733e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14983.0,
+      "completions/mean_length": 6897.765625,
+      "completions/mean_terminated_length": 6823.07080078125,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.9046694040298462,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026788609102368355,
+      "learning_rate": 1e-05,
+      "loss": 0.0664,
+      "num_tokens": 297735285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3266732692718506,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999909520149231,
+      "sampling/importance_sampling_ratio/min": 0.001710799871943891,
+      "sampling/sampling_logp_difference/max": 6.370794296264648,
+      "sampling/sampling_logp_difference/mean": 0.020578179508447647,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 1.7319889593636617e-05,
+      "clip_ratio/high_mean": 5.168538336874917e-06,
+      "clip_ratio/low_mean": 7.019768918326008e-05,
+      "clip_ratio/low_min": 2.541147478041239e-05,
+      "clip_ratio/region_mean": 7.53662266106403e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15525.0,
+      "completions/mean_length": 6971.9921875,
+      "completions/mean_terminated_length": 6509.10595703125,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "entropy": 0.8658201694488525,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005915141198784113,
+      "learning_rate": 1e-05,
+      "loss": 0.0923,
+      "num_tokens": 298645124.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3742823898792267,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999268651008606,
+      "sampling/importance_sampling_ratio/min": 0.000970841443631798,
+      "sampling/sampling_logp_difference/max": 6.937347412109375,
+      "sampling/sampling_logp_difference/mean": 0.01906151883304119,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.8332865238335216e-05,
+      "clip_ratio/high_mean": 4.583216309583804e-06,
+      "clip_ratio/low_mean": 6.167940273371642e-05,
+      "clip_ratio/low_min": 5.969151516183047e-06,
+      "clip_ratio/region_mean": 6.626261847486603e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15054.0,
+      "completions/mean_length": 6545.6953125,
+      "completions/mean_terminated_length": 5889.80859375,
+      "completions/min_length": 800.0,
+      "completions/min_terminated_length": 800.0,
+      "entropy": 0.779609851539135,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0032792428974062204,
+      "learning_rate": 1e-05,
+      "loss": 0.097,
+      "num_tokens": 299503781.0,
+      "reward": 0.609375,
+      "reward_std": 0.38293448090553284,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999361634254456,
+      "sampling/importance_sampling_ratio/min": 0.002187495119869709,
+      "sampling/sampling_logp_difference/max": 6.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.017413027584552765,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.46246323235755e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.46246323235755e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7226.515625,
+      "completions/mean_terminated_length": 7006.736328125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9573849961161613,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005092279519885778,
+      "learning_rate": 1e-05,
+      "loss": 0.1102,
+      "num_tokens": 300447903.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999373555183411,
+      "sampling/importance_sampling_ratio/min": 0.000627054600045085,
+      "sampling/sampling_logp_difference/max": 7.374476909637451,
+      "sampling/sampling_logp_difference/mean": 0.021570835262537003,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 5.487269390869187e-06,
+      "clip_ratio/high_mean": 1.3718173477172968e-06,
+      "clip_ratio/low_mean": 4.7280102080549113e-05,
+      "clip_ratio/low_min": 1.0166083029616857e-05,
+      "clip_ratio/region_mean": 4.865191931457957e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14967.0,
+      "completions/mean_length": 5755.171875,
+      "completions/mean_terminated_length": 5323.10546875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8482184633612633,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005033228080719709,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 301206021.0,
+      "reward": 0.390625,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999947547912598,
+      "sampling/importance_sampling_ratio/min": 0.0014573346124961972,
+      "sampling/sampling_logp_difference/max": 6.531146049499512,
+      "sampling/sampling_logp_difference/mean": 0.018870476633310318,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 5.421346941147931e-06,
+      "clip_ratio/high_mean": 1.3553367352869827e-06,
+      "clip_ratio/low_mean": 1.6510994441887306e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.786633117717429e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 7098.7265625,
+      "completions/mean_terminated_length": 6875.88037109375,
+      "completions/min_length": 947.0,
+      "completions/min_terminated_length": 947.0,
+      "entropy": 0.87320177257061,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.007659573573619127,
+      "learning_rate": 1e-05,
+      "loss": 0.0707,
+      "num_tokens": 302133890.0,
+      "reward": 0.421875,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0012466582702472806,
+      "sampling/sampling_logp_difference/max": 6.687288761138916,
+      "sampling/sampling_logp_difference/mean": 0.019994346424937248,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 1.1556229310372146e-05,
+      "clip_ratio/high_mean": 2.8890573275930365e-06,
+      "clip_ratio/low_mean": 3.8744643916288624e-05,
+      "clip_ratio/low_min": 6.108287834649673e-06,
+      "clip_ratio/region_mean": 4.1633702039689524e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16139.0,
+      "completions/mean_length": 6399.96875,
+      "completions/mean_terminated_length": 6077.90283203125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9481896534562111,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014135175151750445,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 302972566.0,
+      "reward": 0.4140625,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0025698256213217974,
+      "sampling/sampling_logp_difference/max": 5.963917255401611,
+      "sampling/sampling_logp_difference/mean": 0.02073008380830288,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 6.59491388432798e-06,
+      "clip_ratio/high_mean": 2.545892130001448e-06,
+      "clip_ratio/low_mean": 4.620846755187813e-05,
+      "clip_ratio/low_min": 6.243132702365983e-06,
+      "clip_ratio/region_mean": 4.875435956819274e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 7298.078125,
+      "completions/mean_terminated_length": 7226.53564453125,
+      "completions/min_length": 1009.0,
+      "completions/min_terminated_length": 1009.0,
+      "entropy": 0.8719206526875496,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027898226398974657,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 303925976.0,
+      "reward": 0.484375,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.005236432887613773,
+      "sampling/sampling_logp_difference/max": 5.252114772796631,
+      "sampling/sampling_logp_difference/mean": 0.020944103598594666,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 1.052124343914329e-05,
+      "clip_ratio/high_mean": 2.6303108597858227e-06,
+      "clip_ratio/low_mean": 2.010384196182713e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.273415248055244e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14980.0,
+      "completions/mean_length": 5667.0390625,
+      "completions/mean_terminated_length": 5496.9287109375,
+      "completions/min_length": 974.0,
+      "completions/min_terminated_length": 974.0,
+      "entropy": 0.8791451379656792,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012764945859089494,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 304675157.0,
+      "reward": 0.390625,
+      "reward_std": 0.17965976893901825,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000383853912354,
+      "sampling/importance_sampling_ratio/min": 5.054428584116977e-06,
+      "sampling/sampling_logp_difference/max": 12.195245742797852,
+      "sampling/sampling_logp_difference/mean": 0.018928447738289833,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 9.578045592206763e-06,
+      "clip_ratio/high_mean": 2.3945113980516908e-06,
+      "clip_ratio/low_mean": 3.1114799753595435e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350931149270764e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15354.0,
+      "completions/max_terminated_length": 15354.0,
+      "completions/mean_length": 5874.4453125,
+      "completions/mean_terminated_length": 5874.4453125,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9577538818120956,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00509974779561162,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 305447038.0,
+      "reward": 0.515625,
+      "reward_std": 0.24777325987815857,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999423027038574,
+      "sampling/importance_sampling_ratio/min": 0.004791648127138615,
+      "sampling/sampling_logp_difference/max": 5.340880870819092,
+      "sampling/sampling_logp_difference/mean": 0.02114470861852169,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 1.0903062275247066e-05,
+      "clip_ratio/high_mean": 2.7257655688117666e-06,
+      "clip_ratio/low_mean": 4.784364205079328e-05,
+      "clip_ratio/low_min": 3.861600362142781e-06,
+      "clip_ratio/region_mean": 5.056940744907479e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 6197.5703125,
+      "completions/mean_terminated_length": 6035.88134765625,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.8665244281291962,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030849494505673647,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 306258023.0,
+      "reward": 0.515625,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998056888580322,
+      "sampling/importance_sampling_ratio/min": 0.000830297009088099,
+      "sampling/sampling_logp_difference/max": 7.093727111816406,
+      "sampling/sampling_logp_difference/mean": 0.021017421036958694,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 1.4299712574938894e-05,
+      "clip_ratio/high_mean": 4.3520980170796975e-06,
+      "clip_ratio/low_mean": 6.213493452378316e-05,
+      "clip_ratio/low_min": 1.0056635801447555e-05,
+      "clip_ratio/region_mean": 6.648703174505499e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 7522.578125,
+      "completions/mean_terminated_length": 7381.9208984375,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.8185881152749062,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002946985885500908,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 307240305.0,
+      "reward": 0.3125,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.005127199459820986,
+      "sampling/sampling_logp_difference/max": 5.273195743560791,
+      "sampling/sampling_logp_difference/mean": 0.01965932548046112,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.693051035545068e-05,
+      "clip_ratio/high_mean": 5.08456730585749e-06,
+      "clip_ratio/low_mean": 4.2052345861520735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.713691282631771e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14090.0,
+      "completions/mean_length": 6403.2265625,
+      "completions/mean_terminated_length": 6163.6884765625,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "entropy": 0.8359840363264084,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031181599479168653,
+      "learning_rate": 1e-05,
+      "loss": 0.072,
+      "num_tokens": 308079318.0,
+      "reward": 0.5,
+      "reward_std": 0.27145031094551086,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999215602874756,
+      "sampling/importance_sampling_ratio/min": 6.73715621815063e-05,
+      "sampling/sampling_logp_difference/max": 9.605287551879883,
+      "sampling/sampling_logp_difference/mean": 0.01963040418922901,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 1.3988919135954347e-05,
+      "clip_ratio/high_mean": 3.497229783988587e-06,
+      "clip_ratio/low_mean": 6.722658486069122e-05,
+      "clip_ratio/low_min": 1.858519090092159e-05,
+      "clip_ratio/region_mean": 7.072381458783639e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16148.0,
+      "completions/mean_length": 7954.03125,
+      "completions/mean_terminated_length": 7751.71240234375,
+      "completions/min_length": 632.0,
+      "completions/min_terminated_length": 632.0,
+      "entropy": 0.905990719795227,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002656223252415657,
+      "learning_rate": 1e-05,
+      "loss": 0.1022,
+      "num_tokens": 309117770.0,
+      "reward": 0.3828125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999536275863647,
+      "sampling/importance_sampling_ratio/min": 0.0003354826185386628,
+      "sampling/sampling_logp_difference/max": 7.999940395355225,
+      "sampling/sampling_logp_difference/mean": 0.020741507411003113,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.7610595023143105e-05,
+      "clip_ratio/high_mean": 4.402648755785776e-06,
+      "clip_ratio/low_mean": 4.337988764291367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.778253651238629e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6630.09375,
+      "completions/mean_terminated_length": 6315.45166015625,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.870736837387085,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0060529084876179695,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 309988894.0,
+      "reward": 0.515625,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998822212219238,
+      "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05,
+      "sampling/sampling_logp_difference/max": 10.716434478759766,
+      "sampling/sampling_logp_difference/mean": 0.02060208097100258,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 1.0448093235027045e-05,
+      "clip_ratio/high_mean": 2.6120233087567613e-06,
+      "clip_ratio/low_mean": 3.1030769946482906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.364279325523967e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15920.0,
+      "completions/max_terminated_length": 15920.0,
+      "completions/mean_length": 6679.6171875,
+      "completions/mean_terminated_length": 6679.6171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9812518879771233,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00400698184967041,
+      "learning_rate": 1e-05,
+      "loss": 0.0605,
+      "num_tokens": 310864013.0,
+      "reward": 0.421875,
+      "reward_std": 0.3295465111732483,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999049305915833,
+      "sampling/importance_sampling_ratio/min": 0.0020593837834894657,
+      "sampling/sampling_logp_difference/max": 6.1853485107421875,
+      "sampling/sampling_logp_difference/mean": 0.02098071575164795,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 2.124982574969181e-05,
+      "clip_ratio/high_mean": 7.736592579021817e-06,
+      "clip_ratio/low_mean": 2.900951585615985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.674610888992902e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14541.0,
+      "completions/mean_length": 5523.796875,
+      "completions/mean_terminated_length": 5173.4677734375,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9120645374059677,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005929585546255112,
+      "learning_rate": 1e-05,
+      "loss": 0.0362,
+      "num_tokens": 311589987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998446702957153,
+      "sampling/importance_sampling_ratio/min": 0.0010661041596904397,
+      "sampling/sampling_logp_difference/max": 6.843744277954102,
+      "sampling/sampling_logp_difference/mean": 0.019948206841945648,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 2.4486997745043482e-05,
+      "clip_ratio/high_mean": 8.219769085826556e-06,
+      "clip_ratio/low_mean": 5.346400575945154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.168377467474784e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15401.0,
+      "completions/mean_length": 6361.3671875,
+      "completions/mean_terminated_length": 6282.44873046875,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.8044678047299385,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006622390355914831,
+      "learning_rate": 1e-05,
+      "loss": 0.1023,
+      "num_tokens": 312424034.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3724474310874939,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000219345092773,
+      "sampling/importance_sampling_ratio/min": 0.0003157092141918838,
+      "sampling/sampling_logp_difference/max": 8.060688972473145,
+      "sampling/sampling_logp_difference/mean": 0.018907658755779266,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 1.0407376748844399e-05,
+      "clip_ratio/high_mean": 2.6018441872110998e-06,
+      "clip_ratio/low_mean": 5.925514369664597e-05,
+      "clip_ratio/low_min": 1.3324347946763737e-05,
+      "clip_ratio/region_mean": 6.185698703120579e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15883.0,
+      "completions/mean_length": 7109.0,
+      "completions/mean_terminated_length": 7035.96826171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9167275875806808,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004639944992959499,
+      "learning_rate": 1e-05,
+      "loss": 0.0861,
+      "num_tokens": 313353346.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3826971650123596,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999389052391052,
+      "sampling/importance_sampling_ratio/min": 0.0019070414127781987,
+      "sampling/sampling_logp_difference/max": 6.262202262878418,
+      "sampling/sampling_logp_difference/mean": 0.02155841514468193,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 3.959046694035351e-05,
+      "clip_ratio/high_mean": 1.0912523691786191e-05,
+      "clip_ratio/low_mean": 3.3944450819944905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.485697365907981e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6314.2734375,
+      "completions/mean_terminated_length": 6072.60009765625,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.8780038207769394,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.007643720600754023,
+      "learning_rate": 1e-05,
+      "loss": 0.0873,
+      "num_tokens": 314180717.0,
+      "reward": 0.4609375,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999802112579346,
+      "sampling/importance_sampling_ratio/min": 0.021285315975546837,
+      "sampling/sampling_logp_difference/max": 3.8497378826141357,
+      "sampling/sampling_logp_difference/mean": 0.01964358240365982,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 3.065382111344661e-05,
+      "clip_ratio/high_mean": 9.187473835936544e-06,
+      "clip_ratio/low_mean": 4.137891801292426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.056639065514901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6718.2265625,
+      "completions/mean_terminated_length": 6486.24853515625,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.8326799497008324,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050973957404494286,
+      "learning_rate": 1e-05,
+      "loss": 0.0109,
+      "num_tokens": 315060842.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3521803915500641,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000014066696167,
+      "sampling/importance_sampling_ratio/min": 0.0009130688849836588,
+      "sampling/sampling_logp_difference/max": 6.998699188232422,
+      "sampling/sampling_logp_difference/mean": 0.019501537084579468,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 8.624853762739804e-06,
+      "clip_ratio/high_mean": 2.156213440684951e-06,
+      "clip_ratio/low_mean": 1.8797969062234188e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0954182048171788e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16128.0,
+      "completions/mean_length": 8666.8359375,
+      "completions/mean_terminated_length": 7941.291015625,
+      "completions/min_length": 565.0,
+      "completions/min_terminated_length": 565.0,
+      "entropy": 0.9526705741882324,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019092690199613571,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 316190325.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999814629554749,
+      "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05,
+      "sampling/sampling_logp_difference/max": 10.249995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02051631174981594,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 2.147400391550036e-05,
+      "clip_ratio/high_mean": 6.434908300434472e-06,
+      "clip_ratio/low_mean": 3.521234066283796e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.164724816746457e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15164.0,
+      "completions/mean_length": 7661.8203125,
+      "completions/mean_terminated_length": 7002.16015625,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 0.8322782590985298,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019530428107827902,
+      "learning_rate": 1e-05,
+      "loss": 0.0729,
+      "num_tokens": 317191878.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21382391452789307,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 8.546619210392237e-05,
+      "sampling/sampling_logp_difference/max": 9.367389678955078,
+      "sampling/sampling_logp_difference/mean": 0.019894573837518692,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.9436202364886412e-05,
+      "clip_ratio/high_mean": 6.089704697842535e-06,
+      "clip_ratio/low_mean": 4.2698405422925134e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.878810955233348e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15934.0,
+      "completions/mean_length": 7024.859375,
+      "completions/mean_terminated_length": 6800.240234375,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.794853538274765,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031784537713974714,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 318109004.0,
+      "reward": 0.4921875,
+      "reward_std": 0.31800347566604614,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999352693557739,
+      "sampling/importance_sampling_ratio/min": 0.0002962362195830792,
+      "sampling/sampling_logp_difference/max": 8.124353408813477,
+      "sampling/sampling_logp_difference/mean": 0.018519200384616852,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 4.127455667912727e-06,
+      "clip_ratio/high_mean": 1.0318639169781818e-06,
+      "clip_ratio/low_mean": 4.342453667049995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.445640047379129e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 7282.1796875,
+      "completions/mean_terminated_length": 6912.1865234375,
+      "completions/min_length": 870.0,
+      "completions/min_terminated_length": 870.0,
+      "entropy": 0.904067650437355,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005080109462141991,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 319059075.0,
+      "reward": 0.4140625,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000062108039856,
+      "sampling/importance_sampling_ratio/min": 0.1194523349404335,
+      "sampling/sampling_logp_difference/max": 6.136754989624023,
+      "sampling/sampling_logp_difference/mean": 0.019978653639554977,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.608940076243016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.608940076243016e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15625.0,
+      "completions/mean_length": 7131.5234375,
+      "completions/mean_terminated_length": 6596.255859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.8849587142467499,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022667953744530678,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 319990046.0,
+      "reward": 0.46875,
+      "reward_std": 0.30221715569496155,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0370909757912159,
+      "sampling/sampling_logp_difference/max": 3.294381618499756,
+      "sampling/sampling_logp_difference/mean": 0.02037571743130684,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 1.5356635913121863e-05,
+      "clip_ratio/high_mean": 3.839158978280466e-06,
+      "clip_ratio/low_mean": 3.4950805911648786e-05,
+      "clip_ratio/low_min": 4.876336333836662e-06,
+      "clip_ratio/region_mean": 3.8789965287833184e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16205.0,
+      "completions/mean_length": 6655.4453125,
+      "completions/mean_terminated_length": 6578.84228515625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.7417122721672058,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00216497085057199,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 320860135.0,
+      "reward": 0.5625,
+      "reward_std": 0.3369230031967163,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 0.0005190494703128934,
+      "sampling/sampling_logp_difference/max": 7.563511371612549,
+      "sampling/sampling_logp_difference/mean": 0.01771342009305954,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 1.7605634639039636e-05,
+      "clip_ratio/high_mean": 5.297029474604642e-06,
+      "clip_ratio/low_mean": 5.688933060810086e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.218636053745286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15849.0,
+      "completions/mean_length": 7077.1640625,
+      "completions/mean_terminated_length": 6619.45068359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 0.8749325424432755,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0028338562697172165,
+      "learning_rate": 1e-05,
+      "loss": 0.0643,
+      "num_tokens": 321783852.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998220205307007,
+      "sampling/importance_sampling_ratio/min": 7.83290306571871e-06,
+      "sampling/sampling_logp_difference/max": 11.757177352905273,
+      "sampling/sampling_logp_difference/mean": 0.020299233496189117,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 7.301828190975357e-06,
+      "clip_ratio/high_mean": 1.8254570477438392e-06,
+      "clip_ratio/low_mean": 5.158197632226802e-05,
+      "clip_ratio/low_min": 3.735804057214409e-06,
+      "clip_ratio/region_mean": 5.340743223314348e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15329.0,
+      "completions/mean_length": 6034.296875,
+      "completions/mean_terminated_length": 5525.294921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.80014718323946,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022897711023688316,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 322572882.0,
+      "reward": 0.40625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999347925186157,
+      "sampling/importance_sampling_ratio/min": 0.0004105660773348063,
+      "sampling/sampling_logp_difference/max": 7.7979736328125,
+      "sampling/sampling_logp_difference/mean": 0.01858348958194256,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 9.364057859784225e-06,
+      "clip_ratio/high_mean": 3.351393047523743e-06,
+      "clip_ratio/low_mean": 4.186752630630508e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5218919240141986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 8172.109375,
+      "completions/mean_terminated_length": 7838.29248046875,
+      "completions/min_length": 733.0,
+      "completions/min_terminated_length": 733.0,
+      "entropy": 0.8732693120837212,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003263789461925626,
+      "learning_rate": 1e-05,
+      "loss": 0.0356,
+      "num_tokens": 323640904.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3237774670124054,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999354481697083,
+      "sampling/importance_sampling_ratio/min": 9.27252222027164e-06,
+      "sampling/sampling_logp_difference/max": 11.588455200195312,
+      "sampling/sampling_logp_difference/mean": 0.0208889190107584,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 2.0998899799451465e-05,
+      "clip_ratio/high_mean": 6.692962131182867e-06,
+      "clip_ratio/low_mean": 4.261424010110204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.930720297124935e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 7699.203125,
+      "completions/mean_terminated_length": 7419.04833984375,
+      "completions/min_length": 1225.0,
+      "completions/min_terminated_length": 1225.0,
+      "entropy": 0.8296505436301231,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0042716520838439465,
+      "learning_rate": 1e-05,
+      "loss": 0.0937,
+      "num_tokens": 324643858.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874234199524,
+      "sampling/importance_sampling_ratio/min": 0.00022192654432728887,
+      "sampling/sampling_logp_difference/max": 8.413164138793945,
+      "sampling/sampling_logp_difference/mean": 0.018926654011011124,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 7.061349151626928e-06,
+      "clip_ratio/high_mean": 1.765337287906732e-06,
+      "clip_ratio/low_mean": 4.5005243464402156e-05,
+      "clip_ratio/low_min": 3.861838649754645e-06,
+      "clip_ratio/region_mean": 4.6770580411248375e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16364.0,
+      "completions/max_terminated_length": 16364.0,
+      "completions/mean_length": 7450.1640625,
+      "completions/mean_terminated_length": 7450.1640625,
+      "completions/min_length": 910.0,
+      "completions/min_terminated_length": 910.0,
+      "entropy": 1.0400195196270943,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033558050636202097,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 325617687.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999459385871887,
+      "sampling/importance_sampling_ratio/min": 0.039920732378959656,
+      "sampling/sampling_logp_difference/max": 3.2208595275878906,
+      "sampling/sampling_logp_difference/mean": 0.02249298244714737,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 1.3147802746971138e-05,
+      "clip_ratio/high_mean": 3.2869506867427845e-06,
+      "clip_ratio/low_mean": 2.4451034505545977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7737984851228248e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15342.0,
+      "completions/mean_length": 6799.0703125,
+      "completions/mean_terminated_length": 6723.5986328125,
+      "completions/min_length": 1708.0,
+      "completions/min_terminated_length": 1708.0,
+      "entropy": 0.9737623482942581,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005797459278255701,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 326508384.0,
+      "reward": 0.3125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999321699142456,
+      "sampling/importance_sampling_ratio/min": 7.535634836131067e-07,
+      "sampling/sampling_logp_difference/max": 14.0984525680542,
+      "sampling/sampling_logp_difference/mean": 0.021543748676776886,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 3.3594023989280686e-06,
+      "clip_ratio/high_mean": 8.398505997320171e-07,
+      "clip_ratio/low_mean": 2.3457610382138228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4297460981870245e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 7034.3671875,
+      "completions/mean_terminated_length": 6654.30078125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8749603256583214,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002258980879560113,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 327426407.0,
+      "reward": 0.4609375,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999661445617676,
+      "sampling/importance_sampling_ratio/min": 0.008719252422451973,
+      "sampling/sampling_logp_difference/max": 4.742221832275391,
+      "sampling/sampling_logp_difference/mean": 0.01997346058487892,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 2.823375348270929e-05,
+      "clip_ratio/high_mean": 7.058438370677322e-06,
+      "clip_ratio/low_mean": 4.9395109726901865e-05,
+      "clip_ratio/low_min": 1.636556044104509e-05,
+      "clip_ratio/region_mean": 5.6453548268109444e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15240.0,
+      "completions/mean_length": 6623.078125,
+      "completions/mean_terminated_length": 6388.81640625,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.858784057199955,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002420129720121622,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 328292985.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537417411804,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998596906661987,
+      "sampling/importance_sampling_ratio/min": 0.00014900295354891568,
+      "sampling/sampling_logp_difference/max": 8.811544418334961,
+      "sampling/sampling_logp_difference/mean": 0.019645996391773224,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 1.8078507309837732e-05,
+      "clip_ratio/high_mean": 6.468551191574079e-06,
+      "clip_ratio/low_mean": 4.051302585139638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.698157727034413e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15229.0,
+      "completions/mean_length": 5902.4765625,
+      "completions/mean_terminated_length": 5564.36279296875,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.904740035533905,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004107976797968149,
+      "learning_rate": 1e-05,
+      "loss": 0.0824,
+      "num_tokens": 329067006.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3945493996143341,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999526143074036,
+      "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05,
+      "sampling/sampling_logp_difference/max": 11.37439250946045,
+      "sampling/sampling_logp_difference/mean": 0.019582755863666534,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 2.553658168835682e-05,
+      "clip_ratio/high_mean": 7.276365181496658e-06,
+      "clip_ratio/low_mean": 1.7552573126522475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.482893796695862e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6425.6015625,
+      "completions/mean_terminated_length": 6267.5322265625,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.964553713798523,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003208522219210863,
+      "learning_rate": 1e-05,
+      "loss": 0.0164,
+      "num_tokens": 329910691.0,
+      "reward": 0.359375,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999419450759888,
+      "sampling/importance_sampling_ratio/min": 0.00137569778598845,
+      "sampling/sampling_logp_difference/max": 6.588794231414795,
+      "sampling/sampling_logp_difference/mean": 0.021154657006263733,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 6.8712420215888415e-06,
+      "clip_ratio/high_mean": 1.7178105053972104e-06,
+      "clip_ratio/low_mean": 4.0991827404468495e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2709637853022286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 8006.4453125,
+      "completions/mean_terminated_length": 7594.43408203125,
+      "completions/min_length": 1235.0,
+      "completions/min_terminated_length": 1235.0,
+      "entropy": 0.8980336412787437,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002898421371355653,
+      "learning_rate": 1e-05,
+      "loss": 0.0815,
+      "num_tokens": 330956332.0,
+      "reward": 0.4296875,
+      "reward_std": 0.20175684988498688,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 9.378339746035635e-05,
+      "sampling/sampling_logp_difference/max": 9.27452278137207,
+      "sampling/sampling_logp_difference/mean": 0.021021340042352676,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.2689344689297286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2689344689297286e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15484.0,
+      "completions/max_terminated_length": 15484.0,
+      "completions/mean_length": 7068.828125,
+      "completions/mean_terminated_length": 7068.828125,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.9865007549524307,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0037063576746731997,
+      "learning_rate": 1e-05,
+      "loss": 0.0313,
+      "num_tokens": 331880918.0,
+      "reward": 0.3203125,
+      "reward_std": 0.17859892547130585,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0001819290773710236,
+      "sampling/sampling_logp_difference/max": 8.611893653869629,
+      "sampling/sampling_logp_difference/mean": 0.02072504535317421,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 5.845633268108941e-06,
+      "clip_ratio/high_mean": 1.4614083170272352e-06,
+      "clip_ratio/low_mean": 3.207486906831036e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353627721480734e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 7379.390625,
+      "completions/mean_terminated_length": 7236.4609375,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.8977236375212669,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001972826896235347,
+      "learning_rate": 1e-05,
+      "loss": 0.0228,
+      "num_tokens": 332849112.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 2.820451663865242e-05,
+      "sampling/sampling_logp_difference/max": 10.476028442382812,
+      "sampling/sampling_logp_difference/mean": 0.019411223009228706,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 4.875385002378607e-06,
+      "clip_ratio/high_mean": 1.2188462505946518e-06,
+      "clip_ratio/low_mean": 2.3530714997832547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.47495612484272e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15517.0,
+      "completions/mean_length": 6867.9609375,
+      "completions/mean_terminated_length": 6793.03125,
+      "completions/min_length": 760.0,
+      "completions/min_terminated_length": 760.0,
+      "entropy": 0.9244343340396881,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.006926023401319981,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 333746179.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1433562934398651,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.0003875594411510974,
+      "sampling/sampling_logp_difference/max": 7.8556413650512695,
+      "sampling/sampling_logp_difference/mean": 0.020311862230300903,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 1.5651628245905158e-05,
+      "clip_ratio/high_mean": 4.836261211949022e-06,
+      "clip_ratio/low_mean": 5.268017821435933e-05,
+      "clip_ratio/low_min": 3.950945028918795e-06,
+      "clip_ratio/region_mean": 5.751643902840442e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 7525.375,
+      "completions/mean_terminated_length": 6855.3955078125,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9207312315702438,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0047226278111338615,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 334731027.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3353874683380127,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999615550041199,
+      "sampling/importance_sampling_ratio/min": 0.00029753465787507594,
+      "sampling/sampling_logp_difference/max": 8.119979858398438,
+      "sampling/sampling_logp_difference/mean": 0.021496692672371864,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 3.815379886873416e-05,
+      "clip_ratio/high_mean": 9.53844971718354e-06,
+      "clip_ratio/low_mean": 4.519663821156428e-05,
+      "clip_ratio/low_min": 2.775434040813707e-06,
+      "clip_ratio/region_mean": 5.473508826980833e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16251.0,
+      "completions/mean_length": 6841.0625,
+      "completions/mean_terminated_length": 6453.13818359375,
+      "completions/min_length": 689.0,
+      "completions/min_terminated_length": 689.0,
+      "entropy": 0.8979457840323448,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004971448332071304,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 335631243.0,
+      "reward": 0.390625,
+      "reward_std": 0.2596156895160675,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999934196472168,
+      "sampling/importance_sampling_ratio/min": 9.655764188210014e-06,
+      "sampling/sampling_logp_difference/max": 11.547955513000488,
+      "sampling/sampling_logp_difference/mean": 0.020256079733371735,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 4.162365712545579e-06,
+      "clip_ratio/high_mean": 1.0405914281363948e-06,
+      "clip_ratio/low_mean": 3.1563491688757495e-05,
+      "clip_ratio/low_min": 3.1228139505401487e-06,
+      "clip_ratio/region_mean": 3.260408311689389e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15060.0,
+      "completions/mean_length": 6919.8046875,
+      "completions/mean_terminated_length": 6454.35205078125,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9241961911320686,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038604787550866604,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 336537162.0,
+      "reward": 0.375,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998080730438232,
+      "sampling/importance_sampling_ratio/min": 0.0009118975722230971,
+      "sampling/sampling_logp_difference/max": 6.999982833862305,
+      "sampling/sampling_logp_difference/mean": 0.02030865103006363,
+      "step": 384
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 336537162,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-384/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-384/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Debug switch: any truthy value makes the helpers in this file print extra diagnostics.
+# It is overwritten with the value of the --debug flag when this file is run as a script.
+debug = 0
+
+# load to cpu
+# (passed as ``map_location`` to every ``torch.load`` call in this file)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.
+
+    For each param group the partitions of all ranks are concatenated into one flat vector, which is
+    then cut into the individual parameters following the order and shapes of ``param_shapes``.
+
+    Args:
+        - ``state_dict``: dict that is updated in place with ``name -> tensor``
+        - ``world_size``: number of partitions; also used to compute the alignment of the sanity check
+        - ``fp32_flat_groups``: per rank, a list with one flat fp32 tensor per param group
+        - ``zero_model_states``: parsed model states; only the ``param_shapes`` of the first one is used
+
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed numel of a group differs from its available numel
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather group i from every rank and join the pieces in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset walks through the merged vector of the current group
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape may be an object with a numel() method or a plain sequence of ints
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # take the next unpartitioned_numel elements and give them the parameter's shape
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    The object only records where the parameter lives inside the per rank flat groups; the actual
+    tensor is built when ``contiguous()`` is called. It exposes ``shape`` and ``dtype`` so that it
+    can be inspected without building the tensor.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        """
+        Args:
+            - ``flat_groups``: per rank, a list of flat tensors (one per param group)
+            - ``flat_groups_offset``: cumulative numel boundaries of the groups, starting with 0
+            - ``offset``: start position of this parameter's partition, counted across the groups of one rank
+            - ``partitioned_numel``: number of elements each rank holds for this parameter
+            - ``shape``: full shape of the parameter
+        """
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns:
+            - a contiguous tensor with shape ``self.shape``
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # find the group that contains the first element and the group that contains the last one
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            # NOTE(review): start_offset is derived from self.offset for every group of the span, which
+            # looks right only when the partition lies inside a single group - confirm
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # drop the elements beyond the real size before restoring the shape
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) written to ``output_dir``.
+    The result can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training
+    without DeepSpeed. When sharding produces several files, an index json file is written next to them.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          ``None`` disables sharding and writes a single file.
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: if ``safetensors`` or ``huggingface_hub`` is needed for the chosen options but not installed
+    """
+
+    # Dependency pre-check
+    # (done before the conversion so that a missing package is reported before any work is done)
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # lazy_mode keeps the entries unconverted; they are converted shard by shard below
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # (the split is planned on empty tensors that only carry shape and dtype)
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # minimal stand-in exposing the two attributes read below, describing one unsharded file
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # convert only the entries of the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/README.md b/dapo_milora_plus_20251201_131939/checkpoint-448/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-448/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/adapter_config.json
@@ -0,0 +1,40 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-448/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/latest b/dapo_milora_plus_20251201_131939/checkpoint-448/latest
new file mode 100644
index 0000000000000000000000000000000000000000..6c83691d1f18f1aa59c0994e76f1e0d010c88273
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/latest
@@ -0,0 +1 @@
+global_step448
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-448/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-448/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-448/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3cfb0d659f3b7dfbfb24866f32ed103e61c28673
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/trainer_state.json
@@ -0,0 +1,13922 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.41214351425942963,
+  "eval_steps": 500,
+  "global_step": 448,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 9.941184316630824e-06,
+      "clip_ratio/high_mean": 2.485296079157706e-06,
+      "clip_ratio/low_mean": 2.6134909091979353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8620205910101504e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 8426.1015625,
+      "completions/mean_terminated_length": 7965.72705078125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8188603445887566,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030983765609562397,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 162199765.0,
+      "reward": 0.25,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411106109619,
+      "sampling/importance_sampling_ratio/min": 0.0009119694004766643,
+      "sampling/sampling_logp_difference/max": 6.999904155731201,
+      "sampling/sampling_logp_difference/mean": 0.02070600539445877,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 2.612139087432297e-05,
+      "clip_ratio/high_mean": 6.530347718580742e-06,
+      "clip_ratio/low_mean": 3.7853451885894174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.438379949078808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15904.0,
+      "completions/mean_length": 7154.2109375,
+      "completions/mean_terminated_length": 6856.4755859375,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "entropy": 0.9913735538721085,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003430198412388563,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 163133232.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2120065689086914,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000275373458862,
+      "sampling/importance_sampling_ratio/min": 0.00042929715709760785,
+      "sampling/sampling_logp_difference/max": 7.753361225128174,
+      "sampling/sampling_logp_difference/mean": 0.02190260961651802,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 3.1841454983805306e-06,
+      "clip_ratio/high_mean": 7.960363745951327e-07,
+      "clip_ratio/low_mean": 3.384581600585079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4641852380445926e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 7693.1328125,
+      "completions/mean_terminated_length": 7412.7822265625,
+      "completions/min_length": 1077.0,
+      "completions/min_terminated_length": 1077.0,
+      "entropy": 0.9887127950787544,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002780586015433073,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 164134393.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999028444290161,
+      "sampling/importance_sampling_ratio/min": 3.559096626304381e-07,
+      "sampling/sampling_logp_difference/max": 14.848588943481445,
+      "sampling/sampling_logp_difference/mean": 0.021110571920871735,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 9.770586984814145e-06,
+      "clip_ratio/high_mean": 5.008155312680174e-06,
+      "clip_ratio/low_mean": 5.182203130971175e-05,
+      "clip_ratio/low_min": 1.5574546068819473e-05,
+      "clip_ratio/region_mean": 5.683018616764457e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 7072.1484375,
+      "completions/mean_terminated_length": 6771.76611328125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.861792616546154,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030156150460243225,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 165063412.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998926520347595,
+      "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06,
+      "sampling/sampling_logp_difference/max": 12.999247550964355,
+      "sampling/sampling_logp_difference/mean": 0.019325289875268936,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 2.2510209873871645e-05,
+      "clip_ratio/high_mean": 6.455301331698138e-06,
+      "clip_ratio/low_mean": 6.156819108582567e-05,
+      "clip_ratio/low_min": 5.763157332694391e-06,
+      "clip_ratio/region_mean": 6.802349253121065e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15062.0,
+      "completions/mean_length": 7353.421875,
+      "completions/mean_terminated_length": 7062.11279296875,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "entropy": 0.8961873054504395,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034921523183584213,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 166024306.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.0005124400486238301,
+      "sampling/sampling_logp_difference/max": 7.576326847076416,
+      "sampling/sampling_logp_difference/mean": 0.019593238830566406,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.3040991007073899e-05,
+      "clip_ratio/high_mean": 4.292725350296678e-06,
+      "clip_ratio/low_mean": 5.347559840629401e-05,
+      "clip_ratio/low_min": 6.613406640099129e-06,
+      "clip_ratio/region_mean": 5.776832381343411e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15604.0,
+      "completions/mean_length": 7348.03125,
+      "completions/mean_terminated_length": 6903.63916015625,
+      "completions/min_length": 1619.0,
+      "completions/min_terminated_length": 1619.0,
+      "entropy": 0.824029266834259,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027784397825598717,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 166984982.0,
+      "reward": 0.40625,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 0.0010020677000284195,
+      "sampling/sampling_logp_difference/max": 6.905689716339111,
+      "sampling/sampling_logp_difference/mean": 0.01857386901974678,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 3.330808067403268e-05,
+      "clip_ratio/high_mean": 1.0969530649163062e-05,
+      "clip_ratio/low_mean": 3.2080681648949394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3050211388617754e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16358.0,
+      "completions/mean_length": 7290.4765625,
+      "completions/mean_terminated_length": 6920.82080078125,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8884479627013206,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004110465291887522,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 167936971.0,
+      "reward": 0.4375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06,
+      "sampling/sampling_logp_difference/max": 13.219663619995117,
+      "sampling/sampling_logp_difference/mean": 0.019696572795510292,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 9.77357763076725e-06,
+      "clip_ratio/high_mean": 2.4433944076918124e-06,
+      "clip_ratio/low_mean": 3.466498992565903e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.710838473125477e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 7803.625,
+      "completions/mean_terminated_length": 6833.66943359375,
+      "completions/min_length": 929.0,
+      "completions/min_terminated_length": 929.0,
+      "entropy": 0.8326860442757607,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002410614863038063,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "num_tokens": 168955683.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0008801451185718179,
+      "sampling/sampling_logp_difference/max": 7.035423755645752,
+      "sampling/sampling_logp_difference/mean": 0.018545793369412422,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.4602125929741305e-05,
+      "clip_ratio/high_mean": 3.6505314824353263e-06,
+      "clip_ratio/low_mean": 3.4781527119776e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8432058772741584e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6804.34375,
+      "completions/mean_terminated_length": 6495.322265625,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.9669496119022369,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034376555122435093,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 169845823.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31534504890441895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 1.767780588579626e-08,
+      "sampling/sampling_logp_difference/max": 17.850955963134766,
+      "sampling/sampling_logp_difference/mean": 0.020515555515885353,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 1.5814722473805887e-05,
+      "clip_ratio/high_mean": 3.953680618451472e-06,
+      "clip_ratio/low_mean": 3.574208744794305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9695768407455034e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 6827.9609375,
+      "completions/mean_terminated_length": 6105.23583984375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 0.8833946585655212,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026675171684473753,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 170738210.0,
+      "reward": 0.421875,
+      "reward_std": 0.2698654532432556,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000019907951355,
+      "sampling/importance_sampling_ratio/min": 0.002906275913119316,
+      "sampling/sampling_logp_difference/max": 5.840882778167725,
+      "sampling/sampling_logp_difference/mean": 0.019948139786720276,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.6623121837255894e-05,
+      "clip_ratio/high_mean": 4.1557804593139736e-06,
+      "clip_ratio/low_mean": 6.462372630267055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.877950727357529e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15725.0,
+      "completions/mean_length": 7377.984375,
+      "completions/mean_terminated_length": 7307.07080078125,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8881714344024658,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0039620306342840195,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 171705152.0,
+      "reward": 0.3359375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05,
+      "sampling/sampling_logp_difference/max": 10.614632606506348,
+      "sampling/sampling_logp_difference/mean": 0.01964445412158966,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 9.639111340220552e-06,
+      "clip_ratio/high_mean": 2.409777835055138e-06,
+      "clip_ratio/low_mean": 2.775239624952519e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0162174198267167e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15265.0,
+      "completions/mean_length": 6051.8828125,
+      "completions/mean_terminated_length": 5543.74560546875,
+      "completions/min_length": 819.0,
+      "completions/min_terminated_length": 819.0,
+      "entropy": 0.8851477280259132,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0040458571165800095,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 172501881.0,
+      "reward": 0.4296875,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999410510063171,
+      "sampling/importance_sampling_ratio/min": 0.0021976607386022806,
+      "sampling/sampling_logp_difference/max": 6.120361804962158,
+      "sampling/sampling_logp_difference/mean": 0.01957303285598755,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 9.72708312474424e-06,
+      "clip_ratio/high_mean": 3.529455852913088e-06,
+      "clip_ratio/low_mean": 5.158422732165491e-05,
+      "clip_ratio/low_min": 1.1939961495954776e-05,
+      "clip_ratio/region_mean": 5.5113683174567996e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7830.171875,
+      "completions/mean_terminated_length": 7409.4912109375,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.9070459827780724,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005941574461758137,
+      "learning_rate": 1e-05,
+      "loss": 0.0427,
+      "num_tokens": 173522391.0,
+      "reward": 0.34375,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000017881393433,
+      "sampling/importance_sampling_ratio/min": 0.00011712420382536948,
+      "sampling/sampling_logp_difference/max": 9.052275657653809,
+      "sampling/sampling_logp_difference/mean": 0.021295130252838135,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 5.5543214330100454e-06,
+      "clip_ratio/high_mean": 1.3885803582525114e-06,
+      "clip_ratio/low_mean": 1.718775109793569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8576331683561875e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15443.0,
+      "completions/mean_length": 7520.6796875,
+      "completions/mean_terminated_length": 6769.55078125,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.8843575045466423,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025851845275610685,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 174504534.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 0.00039556476986035705,
+      "sampling/sampling_logp_difference/max": 7.835196018218994,
+      "sampling/sampling_logp_difference/mean": 0.02016005665063858,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 1.0145481155632297e-05,
+      "clip_ratio/high_mean": 2.536370288908074e-06,
+      "clip_ratio/low_mean": 3.617897255026037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.871534295285528e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 7382.1875,
+      "completions/mean_terminated_length": 6861.42138671875,
+      "completions/min_length": 934.0,
+      "completions/min_terminated_length": 934.0,
+      "entropy": 0.916313610970974,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004170550964772701,
+      "learning_rate": 1e-05,
+      "loss": 0.047,
+      "num_tokens": 175472574.0,
+      "reward": 0.46875,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05,
+      "sampling/sampling_logp_difference/max": 10.481352806091309,
+      "sampling/sampling_logp_difference/mean": 0.020749717950820923,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.83663013963087e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.83663013963087e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 6122.453125,
+      "completions/mean_terminated_length": 6041.6533203125,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.8984386026859283,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 176275568.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 7.88934721640544e-06,
+      "sampling/sampling_logp_difference/max": 11.74999713897705,
+      "sampling/sampling_logp_difference/mean": 0.020278753712773323,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 1.4535152331518475e-05,
+      "clip_ratio/high_mean": 3.6337880828796187e-06,
+      "clip_ratio/low_mean": 4.3961883989140915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7595671958333696e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15547.0,
+      "completions/mean_length": 4983.2890625,
+      "completions/mean_terminated_length": 4709.67236328125,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.825260303914547,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004848882555961609,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 176932549.0,
+      "reward": 0.6484375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616146087646,
+      "sampling/importance_sampling_ratio/min": 1.626804078114219e-05,
+      "sampling/sampling_logp_difference/max": 11.026308059692383,
+      "sampling/sampling_logp_difference/mean": 0.017959970980882645,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.1141860795760294e-05,
+      "clip_ratio/high_mean": 2.7854651989400736e-06,
+      "clip_ratio/low_mean": 4.2418692146384274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5204157913758536e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 5766.5234375,
+      "completions/mean_terminated_length": 5511.7041015625,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.9016259610652924,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004749474115669727,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 177691752.0,
+      "reward": 0.5,
+      "reward_std": 0.2738044261932373,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000141859054565,
+      "sampling/importance_sampling_ratio/min": 8.927558155846782e-06,
+      "sampling/sampling_logp_difference/max": 11.626367568969727,
+      "sampling/sampling_logp_difference/mean": 0.019118282943964005,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 5.5243735914700665e-06,
+      "clip_ratio/high_mean": 2.1587275114143267e-06,
+      "clip_ratio/low_mean": 4.609663824339805e-05,
+      "clip_ratio/low_min": 3.983555870945565e-06,
+      "clip_ratio/region_mean": 4.8255366664307076e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15696.0,
+      "completions/mean_length": 6993.671875,
+      "completions/mean_terminated_length": 6768.30419921875,
+      "completions/min_length": 889.0,
+      "completions/min_terminated_length": 889.0,
+      "entropy": 0.9074988812208176,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004418120253831148,
+      "learning_rate": 1e-05,
+      "loss": 0.1135,
+      "num_tokens": 178603454.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000037670135498,
+      "sampling/importance_sampling_ratio/min": 0.0018135923892259598,
+      "sampling/sampling_logp_difference/max": 6.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01957814022898674,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 5.126943051436683e-06,
+      "clip_ratio/high_mean": 1.2817357628591708e-06,
+      "clip_ratio/low_mean": 2.7488794444252562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.877053032079857e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 7445.1328125,
+      "completions/mean_terminated_length": 6849.20849609375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9255013465881348,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00237120408564806,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 179577063.0,
+      "reward": 0.40625,
+      "reward_std": 0.21040897071361542,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725818634033,
+      "sampling/importance_sampling_ratio/min": 9.651589061832055e-05,
+      "sampling/sampling_logp_difference/max": 9.245802879333496,
+      "sampling/sampling_logp_difference/mean": 0.02165937051177025,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.8956294752570102e-05,
+      "clip_ratio/high_mean": 4.7390736881425255e-06,
+      "clip_ratio/low_mean": 2.6486316301088664e-05,
+      "clip_ratio/low_min": 3.516273409331916e-06,
+      "clip_ratio/region_mean": 3.122539010291803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6120.5546875,
+      "completions/mean_terminated_length": 5703.34130859375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8181199952960014,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004715202376246452,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 180380422.0,
+      "reward": 0.5,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999874472618103,
+      "sampling/importance_sampling_ratio/min": 0.004350374918431044,
+      "sampling/sampling_logp_difference/max": 5.437493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018377620726823807,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 5.594843969447538e-06,
+      "clip_ratio/high_mean": 2.376495558564784e-06,
+      "clip_ratio/low_mean": 3.4097628713425365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6474124044616474e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 6351.203125,
+      "completions/mean_terminated_length": 5857.78662109375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.8798654451966286,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003063712501898408,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 181212776.0,
+      "reward": 0.453125,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999946355819702,
+      "sampling/importance_sampling_ratio/min": 7.891544555604924e-06,
+      "sampling/sampling_logp_difference/max": 11.74971866607666,
+      "sampling/sampling_logp_difference/mean": 0.019523698836565018,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.544438988001275e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.544438988001275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14180.0,
+      "completions/mean_length": 6330.046875,
+      "completions/mean_terminated_length": 6170.46044921875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.8319354206323624,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033194730058312416,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 182041910.0,
+      "reward": 0.453125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998994469642639,
+      "sampling/importance_sampling_ratio/min": 0.00010535263572819531,
+      "sampling/sampling_logp_difference/max": 9.158197402954102,
+      "sampling/sampling_logp_difference/mean": 0.018981872126460075,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7156292415165808e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7156292415165808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15982.0,
+      "completions/mean_length": 6665.2890625,
+      "completions/mean_terminated_length": 6351.7822265625,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9336326420307159,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004492956213653088,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 182914843.0,
+      "reward": 0.3828125,
+      "reward_std": 0.14807432889938354,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 0.011399568989872932,
+      "sampling/sampling_logp_difference/max": 4.474179744720459,
+      "sampling/sampling_logp_difference/mean": 0.02088768407702446,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.2495465802639956e-05,
+      "clip_ratio/high_mean": 9.084843100026774e-06,
+      "clip_ratio/low_mean": 5.4809036328151706e-05,
+      "clip_ratio/low_min": 8.953898031904828e-06,
+      "clip_ratio/region_mean": 6.389387954186532e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16064.0,
+      "completions/mean_length": 5393.9140625,
+      "completions/mean_terminated_length": 5039.39501953125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.7864786610007286,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003816079581156373,
+      "learning_rate": 1e-05,
+      "loss": -0.004,
+      "num_tokens": 183628152.0,
+      "reward": 0.546875,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998779892921448,
+      "sampling/importance_sampling_ratio/min": 0.003246711567044258,
+      "sampling/sampling_logp_difference/max": 5.730112552642822,
+      "sampling/sampling_logp_difference/mean": 0.018448319286108017,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 8.638648068881594e-06,
+      "clip_ratio/high_mean": 2.1596620172203984e-06,
+      "clip_ratio/low_mean": 1.6896704778446292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9056366909353528e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 7161.5,
+      "completions/mean_terminated_length": 7015.111328125,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.915394201874733,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003666195785626769,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 184562352.0,
+      "reward": 0.3671875,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00025550799909979105,
+      "sampling/sampling_logp_difference/max": 8.272256851196289,
+      "sampling/sampling_logp_difference/mean": 0.019755780696868896,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 6.424931598303374e-06,
+      "clip_ratio/high_mean": 1.6062328995758435e-06,
+      "clip_ratio/low_mean": 2.49038239417132e-05,
+      "clip_ratio/low_min": 4.00025601265952e-06,
+      "clip_ratio/region_mean": 2.651005689813246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15408.0,
+      "completions/mean_length": 7957.671875,
+      "completions/mean_terminated_length": 7685.8544921875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "entropy": 1.1176252663135529,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025940234772861004,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 185606670.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999893844127655,
+      "sampling/importance_sampling_ratio/min": 0.0007622809498570859,
+      "sampling/sampling_logp_difference/max": 7.179195404052734,
+      "sampling/sampling_logp_difference/mean": 0.02338646724820137,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 1.9903963220713194e-05,
+      "clip_ratio/high_mean": 5.829163114867697e-06,
+      "clip_ratio/low_mean": 4.4742550926457625e-05,
+      "clip_ratio/low_min": 3.5803282116830815e-06,
+      "clip_ratio/region_mean": 5.057171370026481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 7060.6640625,
+      "completions/mean_terminated_length": 6759.9111328125,
+      "completions/min_length": 1460.0,
+      "completions/min_terminated_length": 1460.0,
+      "entropy": 0.9148540124297142,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004315398633480072,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 186526883.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0004585353017318994,
+      "sampling/sampling_logp_difference/max": 7.687473297119141,
+      "sampling/sampling_logp_difference/mean": 0.01967843994498253,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 1.147099328591139e-05,
+      "clip_ratio/high_mean": 2.8677483214778476e-06,
+      "clip_ratio/low_mean": 2.8967988555450574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1835736763241584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15596.0,
+      "completions/mean_length": 6649.6640625,
+      "completions/mean_terminated_length": 6416.04052734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9298559054732323,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030786178540438414,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 187397536.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000005841255188,
+      "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07,
+      "sampling/sampling_logp_difference/max": 14.929608345031738,
+      "sampling/sampling_logp_difference/mean": 0.020215414464473724,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 2.2768570943298982e-05,
+      "clip_ratio/high_mean": 5.692142735824746e-06,
+      "clip_ratio/low_mean": 3.249637484259438e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8188517464732286e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 8292.015625,
+      "completions/mean_terminated_length": 7823.8837890625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.8232023045420647,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002438523108139634,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 188477778.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.005636279005557299,
+      "sampling/sampling_logp_difference/max": 5.178531169891357,
+      "sampling/sampling_logp_difference/mean": 0.018984414637088776,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 2.0840709566982696e-05,
+      "clip_ratio/high_mean": 6.135253556749376e-06,
+      "clip_ratio/low_mean": 2.255633432923787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.869158777230041e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15991.0,
+      "completions/mean_length": 7600.9765625,
+      "completions/mean_terminated_length": 6936.71484375,
+      "completions/min_length": 995.0,
+      "completions/min_terminated_length": 995.0,
+      "entropy": 0.8689917623996735,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004773247055709362,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 189470655.0,
+      "reward": 0.40625,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.001327168894931674,
+      "sampling/sampling_logp_difference/max": 6.624707221984863,
+      "sampling/sampling_logp_difference/mean": 0.018666012212634087,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 9.837458947004052e-06,
+      "clip_ratio/high_mean": 2.459364736751013e-06,
+      "clip_ratio/low_mean": 6.463955219260242e-05,
+      "clip_ratio/low_min": 1.0895145351241808e-05,
+      "clip_ratio/region_mean": 6.70989177251613e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16215.0,
+      "completions/mean_length": 7600.34375,
+      "completions/mean_terminated_length": 6855.96630859375,
+      "completions/min_length": 1335.0,
+      "completions/min_terminated_length": 1335.0,
+      "entropy": 0.7636929750442505,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004298723768442869,
+      "learning_rate": 1e-05,
+      "loss": 0.145,
+      "num_tokens": 190462227.0,
+      "reward": 0.515625,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999310374259949,
+      "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05,
+      "sampling/sampling_logp_difference/max": 9.996363639831543,
+      "sampling/sampling_logp_difference/mean": 0.018035393208265305,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 1.4060602325116633e-05,
+      "clip_ratio/high_mean": 3.5151505812791584e-06,
+      "clip_ratio/low_mean": 2.6516039497437305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.003119024924672e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15151.0,
+      "completions/mean_length": 6512.0,
+      "completions/mean_terminated_length": 6434.267578125,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.9043584689497948,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006741553544998169,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 191312483.0,
+      "reward": 0.484375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 1.778468504198827e-05,
+      "sampling/sampling_logp_difference/max": 10.937172889709473,
+      "sampling/sampling_logp_difference/mean": 0.020878732204437256,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 1.7356085209030425e-05,
+      "clip_ratio/high_mean": 4.339021302257606e-06,
+      "clip_ratio/low_mean": 2.8831826739406097e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.317084781429003e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7178.6875,
+      "completions/mean_terminated_length": 6565.00048828125,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.8899475410580635,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00281486171297729,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 192251235.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2240736484527588,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999714493751526,
+      "sampling/importance_sampling_ratio/min": 9.012543159769848e-05,
+      "sampling/sampling_logp_difference/max": 9.314308166503906,
+      "sampling/sampling_logp_difference/mean": 0.020196784287691116,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.5558084214717383e-05,
+      "clip_ratio/high_mean": 3.889521053679346e-06,
+      "clip_ratio/low_mean": 3.0248688972278615e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.413820991227112e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15501.0,
+      "completions/max_terminated_length": 15501.0,
+      "completions/mean_length": 6602.5625,
+      "completions/mean_terminated_length": 6602.5625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.9266818463802338,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005070593673735857,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193116763.0,
+      "reward": 0.53125,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999746680259705,
+      "sampling/importance_sampling_ratio/min": 2.726537559283315e-06,
+      "sampling/sampling_logp_difference/max": 12.812478065490723,
+      "sampling/sampling_logp_difference/mean": 0.020026464015245438,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.188727416476468e-06,
+      "clip_ratio/high_mean": 1.047181854119117e-06,
+      "clip_ratio/low_mean": 2.959152834591805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.063871008635033e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6818.8828125,
+      "completions/mean_terminated_length": 6430.056640625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.874519519507885,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006362155079841614,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 194007868.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009298324585,
+      "sampling/importance_sampling_ratio/min": 0.0005216691642999649,
+      "sampling/sampling_logp_difference/max": 7.55847692489624,
+      "sampling/sampling_logp_difference/mean": 0.01943325623869896,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 9.645911177358357e-06,
+      "clip_ratio/high_mean": 2.4114777943395893e-06,
+      "clip_ratio/low_mean": 6.821557258263056e-05,
+      "clip_ratio/low_min": 1.7265090718865395e-05,
+      "clip_ratio/region_mean": 7.062705049065698e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14536.0,
+      "completions/mean_length": 5515.625,
+      "completions/mean_terminated_length": 5343.111328125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0683523043990135,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003797185141593218,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "num_tokens": 194735980.0,
+      "reward": 0.421875,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 1.137102216830499e-07,
+      "sampling/sampling_logp_difference/max": 15.989612579345703,
+      "sampling/sampling_logp_difference/mean": 0.02120930328965187,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.1971412252241862e-05,
+      "clip_ratio/high_mean": 5.4928530630604655e-06,
+      "clip_ratio/low_mean": 4.9151800567415194e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4644653801005916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 5853.546875,
+      "completions/mean_terminated_length": 5770.6298828125,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.7975900694727898,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004124365746974945,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 195504882.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000672340393066,
+      "sampling/importance_sampling_ratio/min": 0.0032877910416573286,
+      "sampling/sampling_logp_difference/max": 5.717539310455322,
+      "sampling/sampling_logp_difference/mean": 0.017819223925471306,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 7.066538728395244e-06,
+      "clip_ratio/high_mean": 2.843255515472265e-06,
+      "clip_ratio/low_mean": 5.1467116236381116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.431037175185338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6686.25,
+      "completions/mean_terminated_length": 6532.31787109375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.9018580466508865,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024995009880512953,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 196379306.0,
+      "reward": 0.421875,
+      "reward_std": 0.35824593901634216,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999300837516785,
+      "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05,
+      "sampling/sampling_logp_difference/max": 10.818918228149414,
+      "sampling/sampling_logp_difference/mean": 0.018989525735378265,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 6.652828687947476e-06,
+      "clip_ratio/high_mean": 2.5722979444253724e-06,
+      "clip_ratio/low_mean": 3.699686294567073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95691608900961e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16347.0,
+      "completions/mean_length": 7487.3359375,
+      "completions/mean_terminated_length": 7200.3466796875,
+      "completions/min_length": 1222.0,
+      "completions/min_terminated_length": 1222.0,
+      "entropy": 0.9890001565217972,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004295211285352707,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 197357397.0,
+      "reward": 0.40625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0006548459641635418,
+      "sampling/sampling_logp_difference/max": 7.33111047744751,
+      "sampling/sampling_logp_difference/mean": 0.02209121733903885,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 6.0850939007650595e-06,
+      "clip_ratio/high_mean": 1.5212734751912649e-06,
+      "clip_ratio/low_mean": 2.9443070673096372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0964344205131056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7233.484375,
+      "completions/mean_terminated_length": 6938.30615234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.9683803990483284,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003119673579931259,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 198303795.0,
+      "reward": 0.328125,
+      "reward_std": 0.23014704883098602,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000243186950684,
+      "sampling/importance_sampling_ratio/min": 0.020358745008707047,
+      "sampling/sampling_logp_difference/max": 3.89424467086792,
+      "sampling/sampling_logp_difference/mean": 0.021085180342197418,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.963812095113099e-06,
+      "clip_ratio/high_mean": 1.9909530237782747e-06,
+      "clip_ratio/low_mean": 4.031422963635123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.23051826601295e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15733.0,
+      "completions/mean_length": 6457.78125,
+      "completions/mean_terminated_length": 6300.22265625,
+      "completions/min_length": 850.0,
+      "completions/min_terminated_length": 850.0,
+      "entropy": 0.8881053999066353,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033790848683565855,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 199154735.0,
+      "reward": 0.3828125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998799562454224,
+      "sampling/importance_sampling_ratio/min": 2.872048128210736e-07,
+      "sampling/sampling_logp_difference/max": 15.063070297241211,
+      "sampling/sampling_logp_difference/mean": 0.01950821653008461,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 9.059622016138746e-06,
+      "clip_ratio/high_mean": 3.3430123380639998e-06,
+      "clip_ratio/low_mean": 2.2856192117615137e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6199204512522556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 7904.40625,
+      "completions/mean_terminated_length": 7769.81005859375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9881557524204254,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021492803934961557,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 200185643.0,
+      "reward": 0.359375,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001094341278076,
+      "sampling/importance_sampling_ratio/min": 0.001458622980862856,
+      "sampling/sampling_logp_difference/max": 6.530262470245361,
+      "sampling/sampling_logp_difference/mean": 0.021201875060796738,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 6.9962839006620925e-06,
+      "clip_ratio/high_mean": 1.7490709751655231e-06,
+      "clip_ratio/low_mean": 3.018811844412994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193718976035598e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 7414.4921875,
+      "completions/mean_terminated_length": 7414.4921875,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "entropy": 0.9571134969592094,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037221095990389585,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 201153114.0,
+      "reward": 0.4375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999958872795105,
+      "sampling/importance_sampling_ratio/min": 0.0009130563121289015,
+      "sampling/sampling_logp_difference/max": 6.99871301651001,
+      "sampling/sampling_logp_difference/mean": 0.021356744691729546,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 1.1248092050664127e-05,
+      "clip_ratio/high_mean": 2.8120230126660317e-06,
+      "clip_ratio/low_mean": 5.4354991334548686e-05,
+      "clip_ratio/low_min": 6.868132004456129e-06,
+      "clip_ratio/region_mean": 5.716701480196207e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15835.0,
+      "completions/max_terminated_length": 15835.0,
+      "completions/mean_length": 5955.953125,
+      "completions/mean_terminated_length": 5955.953125,
+      "completions/min_length": 1394.0,
+      "completions/min_terminated_length": 1394.0,
+      "entropy": 0.730999618768692,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006285305600613356,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 201933044.0,
+      "reward": 0.59375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.007535050623118877,
+      "sampling/sampling_logp_difference/max": 4.888189792633057,
+      "sampling/sampling_logp_difference/mean": 0.016975615173578262,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 7.226686648209579e-06,
+      "clip_ratio/high_mean": 3.094216481258627e-06,
+      "clip_ratio/low_mean": 4.66828214484849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.977703792974353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6923.3515625,
+      "completions/mean_terminated_length": 6458.0732421875,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9938417226076126,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005667983554303646,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 202837281.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05,
+      "sampling/sampling_logp_difference/max": 10.402952194213867,
+      "sampling/sampling_logp_difference/mean": 0.022059854120016098,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 5.2318769121484365e-06,
+      "clip_ratio/high_mean": 1.3079692280371091e-06,
+      "clip_ratio/low_mean": 4.239228087499214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3700250216716086e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14726.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 5930.9296875,
+      "completions/mean_terminated_length": 5930.9296875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.8100385963916779,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004052883945405483,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 203614448.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999989926815033,
+      "sampling/importance_sampling_ratio/min": 0.00015170808183029294,
+      "sampling/sampling_logp_difference/max": 8.79355239868164,
+      "sampling/sampling_logp_difference/mean": 0.018519222736358643,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 4.905230980511988e-06,
+      "clip_ratio/high_mean": 1.226307745127997e-06,
+      "clip_ratio/low_mean": 5.500513248080097e-05,
+      "clip_ratio/low_min": 7.924934834591113e-06,
+      "clip_ratio/region_mean": 5.6231440112242126e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14996.0,
+      "completions/mean_length": 6911.1015625,
+      "completions/mean_terminated_length": 6108.3134765625,
+      "completions/min_length": 862.0,
+      "completions/min_terminated_length": 862.0,
+      "entropy": 0.9260227829217911,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004494607914239168,
+      "learning_rate": 1e-05,
+      "loss": 0.0269,
+      "num_tokens": 204518261.0,
+      "reward": 0.4140625,
+      "reward_std": 0.34033796191215515,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998886585235596,
+      "sampling/importance_sampling_ratio/min": 0.0015266009140759706,
+      "sampling/sampling_logp_difference/max": 6.484711647033691,
+      "sampling/sampling_logp_difference/mean": 0.020527629181742668,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 8.293764039990492e-06,
+      "clip_ratio/high_mean": 2.073441009997623e-06,
+      "clip_ratio/low_mean": 4.75325257411896e-05,
+      "clip_ratio/low_min": 3.599504680096288e-06,
+      "clip_ratio/region_mean": 4.960596663750039e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14637.0,
+      "completions/mean_length": 6972.921875,
+      "completions/mean_terminated_length": 6823.5400390625,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 1.0095533654093742,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029451537411659956,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 205433843.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05,
+      "sampling/sampling_logp_difference/max": 10.53177547454834,
+      "sampling/sampling_logp_difference/mean": 0.02013089321553707,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 4.163383164268453e-05,
+      "clip_ratio/high_mean": 1.382379150527413e-05,
+      "clip_ratio/low_mean": 3.86000854177837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2423876240936806e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 6706.6640625,
+      "completions/mean_terminated_length": 6313.2763671875,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 0.8647518903017044,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003371767932549119,
+      "learning_rate": 1e-05,
+      "loss": 0.073,
+      "num_tokens": 206310296.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 2.948181463580113e-05,
+      "sampling/sampling_logp_difference/max": 10.431736946105957,
+      "sampling/sampling_logp_difference/mean": 0.019770190119743347,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4946740381892596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4946740381892596e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 6882.609375,
+      "completions/mean_terminated_length": 6415.32763671875,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "entropy": 1.013342760503292,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016336971893906593,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 207210974.0,
+      "reward": 0.359375,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999210834503174,
+      "sampling/importance_sampling_ratio/min": 0.0013267879839986563,
+      "sampling/sampling_logp_difference/max": 6.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.02139991894364357,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.4866403944324702e-05,
+      "clip_ratio/high_mean": 3.7166009860811755e-06,
+      "clip_ratio/low_mean": 3.938925010515959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.310585177336179e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15203.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 6195.7421875,
+      "completions/mean_terminated_length": 6195.7421875,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.8448907434940338,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005036406684666872,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208021893.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955892562866,
+      "sampling/importance_sampling_ratio/min": 0.0040348549373447895,
+      "sampling/sampling_logp_difference/max": 5.512784957885742,
+      "sampling/sampling_logp_difference/mean": 0.018679853528738022,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.1244883353356272e-05,
+      "clip_ratio/high_mean": 2.811220838339068e-06,
+      "clip_ratio/low_mean": 3.422392001084518e-05,
+      "clip_ratio/low_min": 6.451612989621935e-06,
+      "clip_ratio/region_mean": 3.703514119024476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6829.609375,
+      "completions/mean_terminated_length": 6521.40283203125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.8679579794406891,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029643685556948185,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 208912059.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00038063788088038564,
+      "sampling/sampling_logp_difference/max": 7.873661994934082,
+      "sampling/sampling_logp_difference/mean": 0.018488366156816483,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 2.2700600311509334e-05,
+      "clip_ratio/high_mean": 5.675150077877333e-06,
+      "clip_ratio/low_mean": 3.138338854569156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.705853873725573e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14503.0,
+      "completions/max_terminated_length": 14503.0,
+      "completions/mean_length": 5444.4453125,
+      "completions/mean_terminated_length": 5444.4453125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0460086688399315,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035942886024713516,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "num_tokens": 209627804.0,
+      "reward": 0.484375,
+      "reward_std": 0.338498055934906,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997478723526,
+      "sampling/importance_sampling_ratio/min": 0.03179635480046272,
+      "sampling/sampling_logp_difference/max": 3.4484035968780518,
+      "sampling/sampling_logp_difference/mean": 0.020146891474723816,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.477029400120955e-05,
+      "clip_ratio/high_mean": 4.552578502625693e-06,
+      "clip_ratio/low_mean": 5.265122354103369e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.720380158891203e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16244.0,
+      "completions/mean_length": 7657.390625,
+      "completions/mean_terminated_length": 7152.544921875,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 0.9528728649020195,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0044983453117311,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 210630150.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000007152557373,
+      "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05,
+      "sampling/sampling_logp_difference/max": 10.158285140991211,
+      "sampling/sampling_logp_difference/mean": 0.02131088823080063,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 8.607642712377128e-06,
+      "clip_ratio/high_mean": 2.151910678094282e-06,
+      "clip_ratio/low_mean": 2.2759413695894182e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.491132454451872e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7574.3515625,
+      "completions/mean_terminated_length": 7504.984375,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 1.0009776800870895,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006095650140196085,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 211620355.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 0.0013946897815912962,
+      "sampling/sampling_logp_difference/max": 6.575083255767822,
+      "sampling/sampling_logp_difference/mean": 0.021727774292230606,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.764823082339717e-05,
+      "clip_ratio/high_mean": 5.141430960975413e-06,
+      "clip_ratio/low_mean": 5.936152001595474e-05,
+      "clip_ratio/low_min": 9.155588486464694e-06,
+      "clip_ratio/region_mean": 6.450295177273802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14915.0,
+      "completions/mean_length": 7919.6875,
+      "completions/mean_terminated_length": 7716.54443359375,
+      "completions/min_length": 1517.0,
+      "completions/min_terminated_length": 1517.0,
+      "entropy": 1.0405654236674309,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037038614973425865,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 212654747.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381899833679,
+      "sampling/importance_sampling_ratio/min": 0.0057550109922885895,
+      "sampling/sampling_logp_difference/max": 5.157684326171875,
+      "sampling/sampling_logp_difference/mean": 0.022051017731428146,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.265254240934155e-05,
+      "clip_ratio/high_mean": 3.1631356023353874e-06,
+      "clip_ratio/low_mean": 4.716233138424286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.032546687289141e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 8613.4765625,
+      "completions/mean_terminated_length": 7735.0693359375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.890489287674427,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325607368722558,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 213774584.0,
+      "reward": 0.40625,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000060796737671,
+      "sampling/importance_sampling_ratio/min": 1.670176425250247e-05,
+      "sampling/sampling_logp_difference/max": 10.999996185302734,
+      "sampling/sampling_logp_difference/mean": 0.020002499222755432,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.6404605503339553e-05,
+      "clip_ratio/high_mean": 4.101151375834888e-06,
+      "clip_ratio/low_mean": 3.880500707964529e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2906158682853857e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7324.8984375,
+      "completions/mean_terminated_length": 6473.1884765625,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 0.761004202067852,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038265211042016745,
+      "learning_rate": 1e-05,
+      "loss": 0.0717,
+      "num_tokens": 214728371.0,
+      "reward": 0.515625,
+      "reward_std": 0.32719239592552185,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000168085098267,
+      "sampling/importance_sampling_ratio/min": 0.0003049026126973331,
+      "sampling/sampling_logp_difference/max": 8.095518112182617,
+      "sampling/sampling_logp_difference/mean": 0.018367979675531387,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 5.624549885396846e-06,
+      "clip_ratio/high_mean": 1.4061374713492114e-06,
+      "clip_ratio/low_mean": 3.6433707123251224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7839844594600436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14167.0,
+      "completions/max_terminated_length": 14167.0,
+      "completions/mean_length": 6422.0859375,
+      "completions/mean_terminated_length": 6422.0859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.9946094751358032,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002729539293795824,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 215570806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.026308411732316017,
+      "sampling/sampling_logp_difference/max": 3.637866497039795,
+      "sampling/sampling_logp_difference/mean": 0.021903935819864273,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 7.2379848461423535e-06,
+      "clip_ratio/high_mean": 1.8094962115355884e-06,
+      "clip_ratio/low_mean": 3.17277934982485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353728982347093e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15585.0,
+      "completions/mean_length": 6845.2890625,
+      "completions/mean_terminated_length": 6693.88134765625,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8822609707713127,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004974282346665859,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 216465635.0,
+      "reward": 0.5390625,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 8.749838889343664e-05,
+      "sampling/sampling_logp_difference/max": 9.343890190124512,
+      "sampling/sampling_logp_difference/mean": 0.019389234483242035,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.58592818024772e-05,
+      "clip_ratio/high_mean": 3.9648204506193e-06,
+      "clip_ratio/low_mean": 4.096964960353944e-05,
+      "clip_ratio/low_min": 1.7403560605089297e-05,
+      "clip_ratio/region_mean": 4.49344687467601e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 7805.484375,
+      "completions/mean_terminated_length": 7528.7578125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.9977599084377289,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0033159854356199503,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 217485089.0,
+      "reward": 0.421875,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999412298202515,
+      "sampling/importance_sampling_ratio/min": 7.967943383846432e-05,
+      "sampling/sampling_logp_difference/max": 9.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.021925684064626694,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.8265397557115648e-05,
+      "clip_ratio/high_mean": 4.566349389278912e-06,
+      "clip_ratio/low_mean": 4.044636898470344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5012717691861326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15681.0,
+      "completions/mean_length": 7737.5546875,
+      "completions/mean_terminated_length": 7530.04052734375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.8667014688253403,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034952745772898197,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "num_tokens": 218496040.0,
+      "reward": 0.453125,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999128580093384,
+      "sampling/importance_sampling_ratio/min": 6.726370338583365e-05,
+      "sampling/sampling_logp_difference/max": 9.606889724731445,
+      "sampling/sampling_logp_difference/mean": 0.019742710515856743,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 8.244294804171659e-06,
+      "clip_ratio/high_mean": 2.0610737010429148e-06,
+      "clip_ratio/low_mean": 3.204250072030845e-05,
+      "clip_ratio/low_min": 3.323495775475749e-06,
+      "clip_ratio/region_mean": 3.410357436450795e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15858.0,
+      "completions/mean_length": 7365.84375,
+      "completions/mean_terminated_length": 6601.59326171875,
+      "completions/min_length": 744.0,
+      "completions/min_terminated_length": 744.0,
+      "entropy": 0.8151945173740387,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038676802068948746,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 219459140.0,
+      "reward": 0.46875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00023387260443996638,
+      "sampling/sampling_logp_difference/max": 8.360733985900879,
+      "sampling/sampling_logp_difference/mean": 0.018882082775235176,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 6.87833608026267e-06,
+      "clip_ratio/high_mean": 2.9462287329806713e-06,
+      "clip_ratio/low_mean": 5.435333650893881e-05,
+      "clip_ratio/low_min": 5.33937054569833e-06,
+      "clip_ratio/region_mean": 5.729956546929316e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 6448.0078125,
+      "completions/mean_terminated_length": 6369.771484375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9546648040413857,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004310046322643757,
+      "learning_rate": 1e-05,
+      "loss": 0.1082,
+      "num_tokens": 220304605.0,
+      "reward": 0.5703125,
+      "reward_std": 0.35611939430236816,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999396800994873,
+      "sampling/importance_sampling_ratio/min": 0.0001234127557836473,
+      "sampling/sampling_logp_difference/max": 8.99997615814209,
+      "sampling/sampling_logp_difference/mean": 0.020253397524356842,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 6.196094091137638e-06,
+      "clip_ratio/high_mean": 1.5490235227844096e-06,
+      "clip_ratio/low_mean": 2.5416685957679874e-05,
+      "clip_ratio/low_min": 5.5736391004757024e-06,
+      "clip_ratio/region_mean": 2.696570959415112e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 7457.6484375,
+      "completions/mean_terminated_length": 6941.24755859375,
+      "completions/min_length": 604.0,
+      "completions/min_terminated_length": 604.0,
+      "entropy": 0.8182889074087143,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026646999176591635,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 221281968.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2012200653553009,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173283576965,
+      "sampling/importance_sampling_ratio/min": 2.902353571698768e-06,
+      "sampling/sampling_logp_difference/max": 12.749988555908203,
+      "sampling/sampling_logp_difference/mean": 0.019208962097764015,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 1.6189535017474554e-05,
+      "clip_ratio/high_mean": 4.047383754368639e-06,
+      "clip_ratio/low_mean": 3.127787306311802e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.532525670379982e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 8561.109375,
+      "completions/mean_terminated_length": 7969.79052734375,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.9581378549337387,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016026750672608614,
+      "learning_rate": 1e-05,
+      "loss": 0.0131,
+      "num_tokens": 222399046.0,
+      "reward": 0.34375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 1.653693971093162e-06,
+      "sampling/sampling_logp_difference/max": 13.312499046325684,
+      "sampling/sampling_logp_difference/mean": 0.02173236384987831,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 1.4200771602190798e-05,
+      "clip_ratio/high_mean": 4.3255887476334465e-06,
+      "clip_ratio/low_mean": 5.2955770115659107e-05,
+      "clip_ratio/low_min": 3.402656830076012e-06,
+      "clip_ratio/region_mean": 5.7281358749605715e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16239.0,
+      "completions/mean_length": 7152.34375,
+      "completions/mean_terminated_length": 7079.6533203125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9052041247487068,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005460259038954973,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 223335010.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3356297016143799,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999966621398926,
+      "sampling/importance_sampling_ratio/min": 0.010161337442696095,
+      "sampling/sampling_logp_difference/max": 4.589165210723877,
+      "sampling/sampling_logp_difference/mean": 0.01986619457602501,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 1.4350314813782461e-05,
+      "clip_ratio/high_mean": 3.5875787034456152e-06,
+      "clip_ratio/low_mean": 3.81288905373367e-05,
+      "clip_ratio/low_min": 8.099272235995159e-06,
+      "clip_ratio/region_mean": 4.1716469809216505e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 6678.65625,
+      "completions/mean_terminated_length": 6524.603515625,
+      "completions/min_length": 963.0,
+      "completions/min_terminated_length": 963.0,
+      "entropy": 0.9043187350034714,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005933742038905621,
+      "learning_rate": 1e-05,
+      "loss": 0.0966,
+      "num_tokens": 224207006.0,
+      "reward": 0.484375,
+      "reward_std": 0.3316681981086731,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000031590461731,
+      "sampling/importance_sampling_ratio/min": 0.0011734943836927414,
+      "sampling/sampling_logp_difference/max": 6.747769355773926,
+      "sampling/sampling_logp_difference/mean": 0.019827336072921753,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 1.6498819377375185e-05,
+      "clip_ratio/high_mean": 4.124704844343796e-06,
+      "clip_ratio/low_mean": 3.601791678420341e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.014262168539062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6999.0390625,
+      "completions/mean_terminated_length": 6850.07177734375,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8109970837831497,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003635740838944912,
+      "learning_rate": 1e-05,
+      "loss": 0.104,
+      "num_tokens": 225122891.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999303817749023,
+      "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05,
+      "sampling/sampling_logp_difference/max": 10.987512588500977,
+      "sampling/sampling_logp_difference/mean": 0.018912551924586296,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 9.527577958579059e-06,
+      "clip_ratio/high_mean": 2.3818944896447647e-06,
+      "clip_ratio/low_mean": 3.766565987461945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.004755419373396e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15713.0,
+      "completions/mean_length": 7483.7109375,
+      "completions/mean_terminated_length": 7045.9912109375,
+      "completions/min_length": 1153.0,
+      "completions/min_terminated_length": 1153.0,
+      "entropy": 0.9473970532417297,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003405241761356592,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 226102462.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00002920627594,
+      "sampling/importance_sampling_ratio/min": 0.00525119062513113,
+      "sampling/sampling_logp_difference/max": 5.249300479888916,
+      "sampling/sampling_logp_difference/mean": 0.021076779812574387,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.5867321963014547e-05,
+      "clip_ratio/high_mean": 3.966830490753637e-06,
+      "clip_ratio/low_mean": 3.8259706570897833e-05,
+      "clip_ratio/low_min": 3.549019083948224e-06,
+      "clip_ratio/region_mean": 4.2226537743772496e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 7569.03125,
+      "completions/mean_terminated_length": 7357.47216796875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9231455475091934,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025927501264959574,
+      "learning_rate": 1e-05,
+      "loss": 0.0801,
+      "num_tokens": 227093562.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19097033143043518,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0052477638237178326,
+      "sampling/sampling_logp_difference/max": 5.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.020578444004058838,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.344091060673236e-05,
+      "clip_ratio/high_mean": 3.36022765168309e-06,
+      "clip_ratio/low_mean": 4.253613235505327e-05,
+      "clip_ratio/low_min": 3.5579084851633525e-06,
+      "clip_ratio/region_mean": 4.5896360120423196e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 7589.2734375,
+      "completions/mean_terminated_length": 7378.2001953125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9265239909291267,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030512227676808834,
+      "learning_rate": 1e-05,
+      "loss": 0.04,
+      "num_tokens": 228086405.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0002165911573683843,
+      "sampling/sampling_logp_difference/max": 8.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.020208362489938736,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.9613525410022703e-05,
+      "clip_ratio/high_mean": 4.903381352505676e-06,
+      "clip_ratio/low_mean": 3.184792547017423e-05,
+      "clip_ratio/low_min": 7.29296516510658e-06,
+      "clip_ratio/region_mean": 3.675130722058384e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 8420.6875,
+      "completions/mean_terminated_length": 8096.97509765625,
+      "completions/min_length": 1114.0,
+      "completions/min_terminated_length": 1114.0,
+      "entropy": 0.9572964608669281,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022430522367358208,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 229183765.0,
+      "reward": 0.34375,
+      "reward_std": 0.309583842754364,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 0.00029693738906644285,
+      "sampling/sampling_logp_difference/max": 8.121989250183105,
+      "sampling/sampling_logp_difference/mean": 0.021570362150669098,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.728750577167375e-06,
+      "clip_ratio/high_mean": 1.6821876442918438e-06,
+      "clip_ratio/low_mean": 2.1682553096979973e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.336474062758498e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15736.0,
+      "completions/mean_length": 6809.765625,
+      "completions/mean_terminated_length": 6579.984375,
+      "completions/min_length": 860.0,
+      "completions/min_terminated_length": 860.0,
+      "entropy": 0.884086549282074,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004295065999031067,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 230077607.0,
+      "reward": 0.484375,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00754612497985363,
+      "sampling/sampling_logp_difference/max": 4.886721134185791,
+      "sampling/sampling_logp_difference/mean": 0.019895706325769424,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 2.8609347509700456e-05,
+      "clip_ratio/high_mean": 7.152336877425114e-06,
+      "clip_ratio/low_mean": 5.158006410965754e-05,
+      "clip_ratio/low_min": 5.210069957684027e-06,
+      "clip_ratio/region_mean": 5.873240070286556e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15080.0,
+      "completions/mean_length": 7340.6953125,
+      "completions/mean_terminated_length": 6973.0810546875,
+      "completions/min_length": 1616.0,
+      "completions/min_terminated_length": 1616.0,
+      "entropy": 0.9920620769262314,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004631794057786465,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 231035616.0,
+      "reward": 0.4375,
+      "reward_std": 0.3235401213169098,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337792396545,
+      "sampling/importance_sampling_ratio/min": 0.0002508950710762292,
+      "sampling/sampling_logp_difference/max": 8.290475845336914,
+      "sampling/sampling_logp_difference/mean": 0.020591016858816147,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.3085940774290066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3085940774290066e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14120.0,
+      "completions/mean_length": 6748.875,
+      "completions/mean_terminated_length": 6595.93701171875,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.9867061004042625,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035752104595303535,
+      "learning_rate": 1e-05,
+      "loss": 0.0455,
+      "num_tokens": 231920056.0,
+      "reward": 0.40625,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999653100967407,
+      "sampling/importance_sampling_ratio/min": 0.0003869794018100947,
+      "sampling/sampling_logp_difference/max": 7.8571391105651855,
+      "sampling/sampling_logp_difference/mean": 0.02061416581273079,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 1.2506750408647349e-05,
+      "clip_ratio/high_mean": 3.1266876021618373e-06,
+      "clip_ratio/low_mean": 3.10397430212106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.416643085074611e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 7260.3046875,
+      "completions/mean_terminated_length": 7188.46435546875,
+      "completions/min_length": 1384.0,
+      "completions/min_terminated_length": 1384.0,
+      "entropy": 1.0388494208455086,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036644963547587395,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 232869159.0,
+      "reward": 0.390625,
+      "reward_std": 0.2359209954738617,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999546408653259,
+      "sampling/importance_sampling_ratio/min": 0.0008660226594656706,
+      "sampling/sampling_logp_difference/max": 7.051599502563477,
+      "sampling/sampling_logp_difference/mean": 0.02120530977845192,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.704355301830219e-05,
+      "clip_ratio/high_mean": 6.760888254575548e-06,
+      "clip_ratio/low_mean": 3.1861192269388994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.862208097871189e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16073.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 6354.4609375,
+      "completions/mean_terminated_length": 6354.4609375,
+      "completions/min_length": 1035.0,
+      "completions/min_terminated_length": 1035.0,
+      "entropy": 0.8405331820249557,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004709267523139715,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 233702842.0,
+      "reward": 0.546875,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 0.0046309432946145535,
+      "sampling/sampling_logp_difference/max": 5.37499475479126,
+      "sampling/sampling_logp_difference/mean": 0.019126038998365402,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 9.749228638611385e-06,
+      "clip_ratio/high_mean": 2.437307159652846e-06,
+      "clip_ratio/low_mean": 3.855073941849696e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.098804652130639e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16026.0,
+      "completions/mean_length": 6514.578125,
+      "completions/mean_terminated_length": 6357.9208984375,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "entropy": 1.0254098922014236,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003066045930609107,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 234556348.0,
+      "reward": 0.4375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805092811584,
+      "sampling/importance_sampling_ratio/min": 0.005210204049944878,
+      "sampling/sampling_logp_difference/max": 5.257136344909668,
+      "sampling/sampling_logp_difference/mean": 0.019960148259997368,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.0475813724042382e-05,
+      "clip_ratio/high_mean": 2.6189534310105955e-06,
+      "clip_ratio/low_mean": 3.487835761006863e-05,
+      "clip_ratio/low_min": 2.9392399483185727e-06,
+      "clip_ratio/region_mean": 3.749731081370555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 7379.5546875,
+      "completions/mean_terminated_length": 7236.62744140625,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 1.0397320613265038,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005132520105689764,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 235521091.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519364118576,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999256134033203,
+      "sampling/importance_sampling_ratio/min": 0.00016659013635944575,
+      "sampling/sampling_logp_difference/max": 8.699974060058594,
+      "sampling/sampling_logp_difference/mean": 0.021417103707790375,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.9904123973901733e-05,
+      "clip_ratio/high_mean": 5.776861314643611e-06,
+      "clip_ratio/low_mean": 2.6659268655748747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2436129686175263e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14565.0,
+      "completions/mean_length": 7837.1640625,
+      "completions/mean_terminated_length": 7632.04052734375,
+      "completions/min_length": 1346.0,
+      "completions/min_terminated_length": 1346.0,
+      "entropy": 0.8400963917374611,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028969801496714354,
+      "learning_rate": 1e-05,
+      "loss": 0.0143,
+      "num_tokens": 236544160.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887943267822,
+      "sampling/importance_sampling_ratio/min": 2.883308241052873e-07,
+      "sampling/sampling_logp_difference/max": 15.059157371520996,
+      "sampling/sampling_logp_difference/mean": 0.019267702475190163,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 8.562770290154731e-06,
+      "clip_ratio/high_mean": 2.1406925725386827e-06,
+      "clip_ratio/low_mean": 4.060094340729847e-05,
+      "clip_ratio/low_min": 3.8700886761944275e-06,
+      "clip_ratio/region_mean": 4.2741635979837156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15350.0,
+      "completions/mean_length": 6696.3515625,
+      "completions/mean_terminated_length": 6542.57958984375,
+      "completions/min_length": 1239.0,
+      "completions/min_terminated_length": 1239.0,
+      "entropy": 0.8495818004012108,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003412836929783225,
+      "learning_rate": 1e-05,
+      "loss": 0.0803,
+      "num_tokens": 237423101.0,
+      "reward": 0.515625,
+      "reward_std": 0.37981897592544556,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.012152798473834991,
+      "sampling/sampling_logp_difference/max": 4.410195827484131,
+      "sampling/sampling_logp_difference/mean": 0.018458625301718712,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 1.1463653436294408e-05,
+      "clip_ratio/high_mean": 3.646129641765583e-06,
+      "clip_ratio/low_mean": 6.144847083078275e-05,
+      "clip_ratio/low_min": 1.110105540647055e-05,
+      "clip_ratio/region_mean": 6.509460160941671e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15666.0,
+      "completions/mean_length": 7700.3671875,
+      "completions/mean_terminated_length": 7121.45849609375,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "entropy": 0.8258870914578438,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024443145375698805,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 238429956.0,
+      "reward": 0.375,
+      "reward_std": 0.2872493863105774,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999113082885742,
+      "sampling/importance_sampling_ratio/min": 0.00026112530031241477,
+      "sampling/sampling_logp_difference/max": 8.250510215759277,
+      "sampling/sampling_logp_difference/mean": 0.019427984952926636,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 4.218127742205979e-06,
+      "clip_ratio/high_mean": 1.0545319355514948e-06,
+      "clip_ratio/low_mean": 1.7289162997258245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.834369493280974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16112.0,
+      "completions/mean_length": 6255.21875,
+      "completions/mean_terminated_length": 6094.44482421875,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.8179014846682549,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022747826296836138,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 239250160.0,
+      "reward": 0.5234375,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.0002633975527714938,
+      "sampling/sampling_logp_difference/max": 8.241846084594727,
+      "sampling/sampling_logp_difference/mean": 0.018723051995038986,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 1.698448841125355e-05,
+      "clip_ratio/high_mean": 5.369374321162468e-06,
+      "clip_ratio/low_mean": 6.14647315160255e-05,
+      "clip_ratio/low_min": 5.043576493335422e-06,
+      "clip_ratio/region_mean": 6.683410583718796e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15321.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6914.9609375,
+      "completions/mean_terminated_length": 6914.9609375,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9700981751084328,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005685295443981886,
+      "learning_rate": 1e-05,
+      "loss": -0.0056,
+      "num_tokens": 240156211.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998887777328491,
+      "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05,
+      "sampling/sampling_logp_difference/max": 9.997581481933594,
+      "sampling/sampling_logp_difference/mean": 0.021195171400904655,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9186837764427764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9186837764427764e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15469.0,
+      "completions/mean_length": 5227.53125,
+      "completions/mean_terminated_length": 5139.68505859375,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.9116031974554062,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003880272386595607,
+      "learning_rate": 1e-05,
+      "loss": 0.1246,
+      "num_tokens": 240845295.0,
+      "reward": 0.6328125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000362396240234,
+      "sampling/importance_sampling_ratio/min": 0.00012422871077433228,
+      "sampling/sampling_logp_difference/max": 8.993386268615723,
+      "sampling/sampling_logp_difference/mean": 0.018801718950271606,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 2.5015486926349695e-05,
+      "clip_ratio/high_mean": 8.084949570275057e-06,
+      "clip_ratio/low_mean": 5.524710468307603e-05,
+      "clip_ratio/low_min": 3.776891389861703e-06,
+      "clip_ratio/region_mean": 6.333205465125502e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 8065.4765625,
+      "completions/mean_terminated_length": 7510.90869140625,
+      "completions/min_length": 1055.0,
+      "completions/min_terminated_length": 1055.0,
+      "entropy": 0.7446574792265892,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028986844699829817,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 241895676.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3474721610546112,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999842643737793,
+      "sampling/importance_sampling_ratio/min": 0.0017039099475368857,
+      "sampling/sampling_logp_difference/max": 6.3748297691345215,
+      "sampling/sampling_logp_difference/mean": 0.01853121444582939,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 9.486341014053323e-06,
+      "clip_ratio/high_mean": 2.371585253513331e-06,
+      "clip_ratio/low_mean": 2.896106741445692e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.133265261112683e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15534.0,
+      "completions/max_terminated_length": 15534.0,
+      "completions/mean_length": 6127.359375,
+      "completions/mean_terminated_length": 6127.359375,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.8569132760167122,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003845847910270095,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 242698258.0,
+      "reward": 0.53125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000942945480347,
+      "sampling/importance_sampling_ratio/min": 0.00043231461313553154,
+      "sampling/sampling_logp_difference/max": 7.746356964111328,
+      "sampling/sampling_logp_difference/mean": 0.01856958493590355,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 2.9848330086679198e-05,
+      "clip_ratio/high_mean": 7.4620825216697995e-06,
+      "clip_ratio/low_mean": 4.3558867673709756e-05,
+      "clip_ratio/low_min": 4.417741820361698e-06,
+      "clip_ratio/region_mean": 5.1020949285884853e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15192.0,
+      "completions/mean_length": 6600.1484375,
+      "completions/mean_terminated_length": 6365.33642578125,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.78924310952425,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003953634761273861,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 243560957.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.0006525487406179309,
+      "sampling/sampling_logp_difference/max": 7.334624767303467,
+      "sampling/sampling_logp_difference/mean": 0.018097909167408943,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 6.635561703660642e-06,
+      "clip_ratio/high_mean": 1.6588904259151604e-06,
+      "clip_ratio/low_mean": 2.737523408313791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9034124281679397e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7852.171875,
+      "completions/mean_terminated_length": 7852.171875,
+      "completions/min_length": 1276.0,
+      "completions/min_terminated_length": 1276.0,
+      "entropy": 1.0598893761634827,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00360781978815794,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 244585923.0,
+      "reward": 0.3125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05,
+      "sampling/sampling_logp_difference/max": 10.076086044311523,
+      "sampling/sampling_logp_difference/mean": 0.022330068051815033,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 3.1540168947685743e-06,
+      "clip_ratio/high_mean": 7.885042236921436e-07,
+      "clip_ratio/low_mean": 4.7973388973332476e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.876189268543385e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7972.2265625,
+      "completions/mean_terminated_length": 7700.87890625,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.933217465877533,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0027661293279379606,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 245628064.0,
+      "reward": 0.28125,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05,
+      "sampling/sampling_logp_difference/max": 10.366576194763184,
+      "sampling/sampling_logp_difference/mean": 0.021125148981809616,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.2965969062861404e-05,
+      "clip_ratio/high_mean": 3.241492265715351e-06,
+      "clip_ratio/low_mean": 4.6317693090713874e-05,
+      "clip_ratio/low_min": 3.820877282123547e-06,
+      "clip_ratio/region_mean": 4.955918507221213e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15744.0,
+      "completions/mean_length": 7135.6953125,
+      "completions/mean_terminated_length": 6913.736328125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 0.7786942347884178,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005680318456143141,
+      "learning_rate": 1e-05,
+      "loss": 0.0786,
+      "num_tokens": 246561329.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999462366104126,
+      "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05,
+      "sampling/sampling_logp_difference/max": 9.737424850463867,
+      "sampling/sampling_logp_difference/mean": 0.018504241481423378,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.22437145175536e-05,
+      "clip_ratio/low_min": 1.4025082009538892e-05,
+      "clip_ratio/region_mean": 4.22437145175536e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 6704.046875,
+      "completions/mean_terminated_length": 6627.82666015625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "entropy": 1.0435140281915665,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026402862276881933,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 247437415.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31276631355285645,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998904466629028,
+      "sampling/importance_sampling_ratio/min": 0.0007800163584761322,
+      "sampling/sampling_logp_difference/max": 7.156195640563965,
+      "sampling/sampling_logp_difference/mean": 0.02134273201227188,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 2.223430897174694e-05,
+      "clip_ratio/high_mean": 6.8746438159905665e-06,
+      "clip_ratio/low_mean": 4.7084630978133646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3959275192028144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 5892.5078125,
+      "completions/mean_terminated_length": 5725.9765625,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "entropy": 0.8004944771528244,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003993614576756954,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 248211112.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0024652592837810516,
+      "sampling/sampling_logp_difference/max": 6.005458354949951,
+      "sampling/sampling_logp_difference/mean": 0.01924925297498703,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 2.1833082200828358e-05,
+      "clip_ratio/high_mean": 5.458270550207089e-06,
+      "clip_ratio/low_mean": 3.415995615796419e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961822596920683e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 7812.140625,
+      "completions/mean_terminated_length": 7316.24755859375,
+      "completions/min_length": 1515.0,
+      "completions/min_terminated_length": 1515.0,
+      "entropy": 0.8841542899608612,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001573400106281042,
+      "learning_rate": 1e-05,
+      "loss": 0.0823,
+      "num_tokens": 249228106.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 0.001001527882181108,
+      "sampling/sampling_logp_difference/max": 6.906228542327881,
+      "sampling/sampling_logp_difference/mean": 0.01956877112388611,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 1.014439021673752e-05,
+      "clip_ratio/high_mean": 2.53609755418438e-06,
+      "clip_ratio/low_mean": 3.068193461785995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.321803217204433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 6372.953125,
+      "completions/mean_terminated_length": 6132.6884765625,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.8228401988744736,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021125099156051874,
+      "learning_rate": 1e-05,
+      "loss": 0.0438,
+      "num_tokens": 250063284.0,
+      "reward": 0.5,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05,
+      "sampling/sampling_logp_difference/max": 9.937475204467773,
+      "sampling/sampling_logp_difference/mean": 0.01943521574139595,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 7.023906164249638e-06,
+      "clip_ratio/high_mean": 1.7559765410624095e-06,
+      "clip_ratio/low_mean": 2.526416994896863e-05,
+      "clip_ratio/low_min": 6.7760895490209805e-06,
+      "clip_ratio/region_mean": 2.7020146660561295e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16270.0,
+      "completions/mean_length": 7817.8671875,
+      "completions/mean_terminated_length": 7396.58154296875,
+      "completions/min_length": 1568.0,
+      "completions/min_terminated_length": 1568.0,
+      "entropy": 0.9454319775104523,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022315154783427715,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 251085123.0,
+      "reward": 0.40625,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06,
+      "sampling/sampling_logp_difference/max": 12.760490417480469,
+      "sampling/sampling_logp_difference/mean": 0.021764669567346573,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 1.4797966287005693e-05,
+      "clip_ratio/high_mean": 3.699491571751423e-06,
+      "clip_ratio/low_mean": 4.36271948274225e-05,
+      "clip_ratio/low_min": 3.6957101201551268e-06,
+      "clip_ratio/region_mean": 4.732668639917392e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 7168.4921875,
+      "completions/mean_terminated_length": 6635.36328125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8433891162276268,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 252020906.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589920043945,
+      "sampling/importance_sampling_ratio/min": 0.0003851866349577904,
+      "sampling/sampling_logp_difference/max": 7.861782550811768,
+      "sampling/sampling_logp_difference/mean": 0.01929781585931778,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 1.996871560550062e-05,
+      "clip_ratio/high_mean": 6.089093403716106e-06,
+      "clip_ratio/low_mean": 4.2792244585143635e-05,
+      "clip_ratio/low_min": 1.0337215371691855e-05,
+      "clip_ratio/region_mean": 4.8881338216233416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7322.5078125,
+      "completions/mean_terminated_length": 6876.8603515625,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 0.9157031401991844,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036942458245903254,
+      "learning_rate": 1e-05,
+      "loss": 0.079,
+      "num_tokens": 252977435.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24275577068328857,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999804496765137,
+      "sampling/importance_sampling_ratio/min": 0.00029605376766994596,
+      "sampling/sampling_logp_difference/max": 8.124969482421875,
+      "sampling/sampling_logp_difference/mean": 0.0205365102738142,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.631919460327481e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.631919460327481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16078.0,
+      "completions/mean_length": 7025.484375,
+      "completions/mean_terminated_length": 6723.5966796875,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 1.1329731941223145,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034127074759453535,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 253896161.0,
+      "reward": 0.25,
+      "reward_std": 0.27722424268722534,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0005197672289796174,
+      "sampling/sampling_logp_difference/max": 7.562129497528076,
+      "sampling/sampling_logp_difference/mean": 0.023741140961647034,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 4.368643658381188e-06,
+      "clip_ratio/high_mean": 1.092160914595297e-06,
+      "clip_ratio/low_mean": 2.4661783299961826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5753944555617636e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13776.0,
+      "completions/mean_length": 5996.1796875,
+      "completions/mean_terminated_length": 5661.08837890625,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8773328885436058,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003959407564252615,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 254690264.0,
+      "reward": 0.53125,
+      "reward_std": 0.26645541191101074,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07,
+      "sampling/sampling_logp_difference/max": 15.73043155670166,
+      "sampling/sampling_logp_difference/mean": 0.018407585099339485,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.616483677935321e-05,
+      "clip_ratio/high_mean": 4.041209194838302e-06,
+      "clip_ratio/low_mean": 3.736187466074625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140308453770558e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16383.0,
+      "completions/mean_length": 7165.328125,
+      "completions/mean_terminated_length": 6867.951171875,
+      "completions/min_length": 1115.0,
+      "completions/min_terminated_length": 1115.0,
+      "entropy": 0.9502597972750664,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030910037457942963,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 255626394.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000731945037842,
+      "sampling/importance_sampling_ratio/min": 0.00022311302018351853,
+      "sampling/sampling_logp_difference/max": 8.407832145690918,
+      "sampling/sampling_logp_difference/mean": 0.020668907091021538,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.1702686606440693e-05,
+      "clip_ratio/high_mean": 2.9256716516101733e-06,
+      "clip_ratio/low_mean": 5.5247357522603124e-05,
+      "clip_ratio/low_min": 3.6811261452385224e-06,
+      "clip_ratio/region_mean": 5.8173028264718596e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15375.0,
+      "completions/mean_length": 8001.9296875,
+      "completions/mean_terminated_length": 7661.34912109375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "entropy": 0.8591345250606537,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037233952898532152,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 256673457.0,
+      "reward": 0.421875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999151229858398,
+      "sampling/importance_sampling_ratio/min": 0.0021876997780054808,
+      "sampling/sampling_logp_difference/max": 6.124904632568359,
+      "sampling/sampling_logp_difference/mean": 0.020540472120046616,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 3.721341136042611e-05,
+      "clip_ratio/high_mean": 1.2759249216287571e-05,
+      "clip_ratio/low_mean": 3.570647322703735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.846572301175911e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 6924.84375,
+      "completions/mean_terminated_length": 6697.82421875,
+      "completions/min_length": 803.0,
+      "completions/min_terminated_length": 803.0,
+      "entropy": 0.7969356626272202,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006054217461496592,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 257578501.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.007889713160693645,
+      "sampling/sampling_logp_difference/max": 4.842195510864258,
+      "sampling/sampling_logp_difference/mean": 0.019306108355522156,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.0211543894911301e-05,
+      "clip_ratio/high_mean": 2.5528859737278253e-06,
+      "clip_ratio/low_mean": 5.2388056587915344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4940942732173426e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14439.0,
+      "completions/mean_length": 6203.03125,
+      "completions/mean_terminated_length": 5958.6884765625,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "entropy": 0.8734413683414459,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004903806839138269,
+      "learning_rate": 1e-05,
+      "loss": 0.0689,
+      "num_tokens": 258392625.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999826550483704,
+      "sampling/importance_sampling_ratio/min": 0.00020370795391499996,
+      "sampling/sampling_logp_difference/max": 8.498823165893555,
+      "sampling/sampling_logp_difference/mean": 0.01909301057457924,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.5135058674786706e-05,
+      "clip_ratio/high_mean": 4.64845766146027e-06,
+      "clip_ratio/low_mean": 4.373456977191381e-05,
+      "clip_ratio/low_min": 3.670856358439778e-06,
+      "clip_ratio/region_mean": 4.8383026296505705e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 7982.5390625,
+      "completions/mean_terminated_length": 7641.01611328125,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0091779381036758,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033637424930930138,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 259435270.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999765753746033,
+      "sampling/importance_sampling_ratio/min": 0.0016514655435457826,
+      "sampling/sampling_logp_difference/max": 6.406092166900635,
+      "sampling/sampling_logp_difference/mean": 0.02182736061513424,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 2.3964702677403693e-05,
+      "clip_ratio/high_mean": 5.991175669350923e-06,
+      "clip_ratio/low_mean": 5.2442986770984135e-05,
+      "clip_ratio/low_min": 8.75736759553547e-06,
+      "clip_ratio/region_mean": 5.843416238349164e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6915.3125,
+      "completions/mean_terminated_length": 6688.064453125,
+      "completions/min_length": 778.0,
+      "completions/min_terminated_length": 778.0,
+      "entropy": 0.7964543774724007,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052203768864274025,
+      "learning_rate": 1e-05,
+      "loss": 0.144,
+      "num_tokens": 260337614.0,
+      "reward": 0.46875,
+      "reward_std": 0.37928223609924316,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 7.032832218101248e-05,
+      "sampling/sampling_logp_difference/max": 9.562335968017578,
+      "sampling/sampling_logp_difference/mean": 0.017896221950650215,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 4.458271632756805e-05,
+      "clip_ratio/high_mean": 1.1145679081892013e-05,
+      "clip_ratio/low_mean": 6.243192206056847e-05,
+      "clip_ratio/low_min": 1.2397775662975619e-05,
+      "clip_ratio/region_mean": 7.357759886872373e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7029.4375,
+      "completions/mean_terminated_length": 6880.95263671875,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "entropy": 0.8605096861720085,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005570738110691309,
+      "learning_rate": 1e-05,
+      "loss": 0.0984,
+      "num_tokens": 261254070.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3327290117740631,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999494552612305,
+      "sampling/importance_sampling_ratio/min": 0.0009070249507203698,
+      "sampling/sampling_logp_difference/max": 7.005340576171875,
+      "sampling/sampling_logp_difference/mean": 0.01905740052461624,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 3.390461233720998e-05,
+      "clip_ratio/high_mean": 1.1191766247975465e-05,
+      "clip_ratio/low_mean": 7.46641262594494e-05,
+      "clip_ratio/low_min": 5.041745680500753e-06,
+      "clip_ratio/region_mean": 8.585589102949598e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5858.84375,
+      "completions/mean_terminated_length": 5606.240234375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.8430554121732712,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004496110137552023,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 262024906.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294877052307,
+      "sampling/importance_sampling_ratio/min": 0.00040469475788995624,
+      "sampling/sampling_logp_difference/max": 7.812377452850342,
+      "sampling/sampling_logp_difference/mean": 0.019225869327783585,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 3.2563955301156966e-06,
+      "clip_ratio/high_mean": 8.140988825289242e-07,
+      "clip_ratio/low_mean": 3.7080020149460324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.789411886145899e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15976.0,
+      "completions/mean_length": 8337.328125,
+      "completions/mean_terminated_length": 7728.7568359375,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.901745393872261,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00348713924176991,
+      "learning_rate": 1e-05,
+      "loss": -0.0002,
+      "num_tokens": 263110844.0,
+      "reward": 0.296875,
+      "reward_std": 0.20805485546588898,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998900890350342,
+      "sampling/importance_sampling_ratio/min": 0.0022652465850114822,
+      "sampling/sampling_logp_difference/max": 6.090071678161621,
+      "sampling/sampling_logp_difference/mean": 0.02157524600625038,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 2.3739744847262045e-05,
+      "clip_ratio/high_mean": 5.934936211815511e-06,
+      "clip_ratio/low_mean": 2.823553325015382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.417046866616147e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 7084.7265625,
+      "completions/mean_terminated_length": 6381.42041015625,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8265534415841103,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003980033565312624,
+      "learning_rate": 1e-05,
+      "loss": 0.0551,
+      "num_tokens": 264036169.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673366546631,
+      "sampling/importance_sampling_ratio/min": 0.00012345099821686745,
+      "sampling/sampling_logp_difference/max": 8.999666213989258,
+      "sampling/sampling_logp_difference/mean": 0.018782664090394974,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 1.1745505617000163e-05,
+      "clip_ratio/high_mean": 3.771558226617344e-06,
+      "clip_ratio/low_mean": 6.913120819262986e-05,
+      "clip_ratio/low_min": 2.494283216947224e-05,
+      "clip_ratio/region_mean": 7.290276607818669e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6543.796875,
+      "completions/mean_terminated_length": 6543.796875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8899869695305824,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.006467343773692846,
+      "learning_rate": 1e-05,
+      "loss": 0.1139,
+      "num_tokens": 264892767.0,
+      "reward": 0.484375,
+      "reward_std": 0.3934885561466217,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000489950180054,
+      "sampling/importance_sampling_ratio/min": 9.891482477542013e-05,
+      "sampling/sampling_logp_difference/max": 9.221251487731934,
+      "sampling/sampling_logp_difference/mean": 0.02032080665230751,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.395576979732141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.395576979732141e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16307.0,
+      "completions/mean_length": 8483.390625,
+      "completions/mean_terminated_length": 7813.84765625,
+      "completions/min_length": 1342.0,
+      "completions/min_terminated_length": 1342.0,
+      "entropy": 0.9621479511260986,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003174177836626768,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 265995697.0,
+      "reward": 0.3359375,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.0005628522485494614,
+      "sampling/sampling_logp_difference/max": 7.4824934005737305,
+      "sampling/sampling_logp_difference/mean": 0.02145479805767536,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.2596524811669951e-05,
+      "clip_ratio/high_mean": 3.149131202917488e-06,
+      "clip_ratio/low_mean": 3.7911659774181317e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.106079018129094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14985.0,
+      "completions/mean_length": 7184.578125,
+      "completions/mean_terminated_length": 6963.79248046875,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.9993807673454285,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003356153378263116,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 266937707.0,
+      "reward": 0.3828125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000238418579102,
+      "sampling/importance_sampling_ratio/min": 0.0017036627978086472,
+      "sampling/sampling_logp_difference/max": 6.374974727630615,
+      "sampling/sampling_logp_difference/mean": 0.02204768732190132,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 1.9245163684900035e-05,
+      "clip_ratio/high_mean": 4.811290921225009e-06,
+      "clip_ratio/low_mean": 4.8845648166206956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.365693925796222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16216.0,
+      "completions/mean_length": 7029.2265625,
+      "completions/mean_terminated_length": 6727.45947265625,
+      "completions/min_length": 851.0,
+      "completions/min_terminated_length": 851.0,
+      "entropy": 0.9139953926205635,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006375293247401714,
+      "learning_rate": 1e-05,
+      "loss": 0.0519,
+      "num_tokens": 267853880.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.010649868287146091,
+      "sampling/sampling_logp_difference/max": 4.542207717895508,
+      "sampling/sampling_logp_difference/mean": 0.020365029573440552,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 4.812504812434781e-06,
+      "clip_ratio/high_mean": 1.2031262031086953e-06,
+      "clip_ratio/low_mean": 2.5999243803198624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.720237000630732e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6188.0078125,
+      "completions/mean_terminated_length": 5943.30419921875,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.7640773430466652,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003697809297591448,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 268665721.0,
+      "reward": 0.5078125,
+      "reward_std": 0.20699402689933777,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372363090515,
+      "sampling/importance_sampling_ratio/min": 0.02927250787615776,
+      "sampling/sampling_logp_difference/max": 3.531106472015381,
+      "sampling/sampling_logp_difference/mean": 0.016581017524003983,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1358927824621787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1358927824621787e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 8128.21875,
+      "completions/mean_terminated_length": 7861.90283203125,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.8218234181404114,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002286596456542611,
+      "learning_rate": 1e-05,
+      "loss": 0.0763,
+      "num_tokens": 269726181.0,
+      "reward": 0.375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999798536300659,
+      "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06,
+      "sampling/sampling_logp_difference/max": 12.90043830871582,
+      "sampling/sampling_logp_difference/mean": 0.019403984770178795,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 1.4808477317274082e-05,
+      "clip_ratio/high_mean": 3.7021193293185206e-06,
+      "clip_ratio/low_mean": 3.0363167581981543e-05,
+      "clip_ratio/low_min": 6.364238288369961e-06,
+      "clip_ratio/region_mean": 3.4065286854456645e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 5673.3359375,
+      "completions/mean_terminated_length": 5503.32568359375,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.9275510385632515,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00485506234690547,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 270470616.0,
+      "reward": 0.4921875,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.0009123464697040617,
+      "sampling/sampling_logp_difference/max": 6.999490737915039,
+      "sampling/sampling_logp_difference/mean": 0.01881871558725834,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 1.1274602456978755e-05,
+      "clip_ratio/high_mean": 3.6739949109687586e-06,
+      "clip_ratio/low_mean": 3.968570712231667e-05,
+      "clip_ratio/low_min": 3.4213767321489286e-06,
+      "clip_ratio/region_mean": 4.335970191959859e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 6944.8984375,
+      "completions/mean_terminated_length": 6795.07177734375,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9335741624236107,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005874342750757933,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 271377723.0,
+      "reward": 0.390625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000594854354858,
+      "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05,
+      "sampling/sampling_logp_difference/max": 10.049861907958984,
+      "sampling/sampling_logp_difference/mean": 0.020590776577591896,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 1.264126694877632e-05,
+      "clip_ratio/high_mean": 3.16031673719408e-06,
+      "clip_ratio/low_mean": 3.206376845810155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.522408474054828e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 7705.625,
+      "completions/mean_terminated_length": 7278.8193359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.8491624072194099,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001684082904830575,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 272384891.0,
+      "reward": 0.390625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 6.605865200981498e-05,
+      "sampling/sampling_logp_difference/max": 9.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.020136822015047073,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 9.772357770998497e-06,
+      "clip_ratio/high_mean": 2.443089442749624e-06,
+      "clip_ratio/low_mean": 3.8573590472879005e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.101667946088128e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6611.1484375,
+      "completions/mean_terminated_length": 6534.19677734375,
+      "completions/min_length": 1116.0,
+      "completions/min_terminated_length": 1116.0,
+      "entropy": 0.8867302760481834,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003692191792652011,
+      "learning_rate": 1e-05,
+      "loss": 0.1233,
+      "num_tokens": 273251630.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999606609344482,
+      "sampling/importance_sampling_ratio/min": 0.0031062732450664043,
+      "sampling/sampling_logp_difference/max": 5.774331569671631,
+      "sampling/sampling_logp_difference/mean": 0.019237037748098373,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 3.0103737344688852e-05,
+      "clip_ratio/high_mean": 9.664363972206047e-06,
+      "clip_ratio/low_mean": 1.7575501146893657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.723986426644842e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15786.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 6770.46875,
+      "completions/mean_terminated_length": 6770.46875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.8252957463264465,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004167635925114155,
+      "learning_rate": 1e-05,
+      "loss": -0.0072,
+      "num_tokens": 274146482.0,
+      "reward": 0.5703125,
+      "reward_std": 0.23486016690731049,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.00010247006866848096,
+      "sampling/sampling_logp_difference/max": 9.18593978881836,
+      "sampling/sampling_logp_difference/mean": 0.019684650003910065,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 6.529460733872838e-06,
+      "clip_ratio/high_mean": 1.6323651834682096e-06,
+      "clip_ratio/low_mean": 3.877351048231503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.040587566578324e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15827.0,
+      "completions/mean_length": 8210.859375,
+      "completions/mean_terminated_length": 7365.36181640625,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.8118235394358635,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030363225378096104,
+      "learning_rate": 1e-05,
+      "loss": 0.0531,
+      "num_tokens": 275214040.0,
+      "reward": 0.3515625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998943209648132,
+      "sampling/importance_sampling_ratio/min": 0.002854935359209776,
+      "sampling/sampling_logp_difference/max": 5.858705997467041,
+      "sampling/sampling_logp_difference/mean": 0.019275270402431488,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.0800629146106075e-06,
+      "clip_ratio/high_mean": 1.7700157286526519e-06,
+      "clip_ratio/low_mean": 2.3981688286767167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5751703674359305e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14900.0,
+      "completions/mean_length": 7072.8828125,
+      "completions/mean_terminated_length": 6849.41650390625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8018335327506065,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004777858033776283,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 276138049.0,
+      "reward": 0.453125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368190765381,
+      "sampling/importance_sampling_ratio/min": 0.0028502768836915493,
+      "sampling/sampling_logp_difference/max": 5.860339164733887,
+      "sampling/sampling_logp_difference/mean": 0.01849908009171486,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 2.259368602608447e-05,
+      "clip_ratio/high_mean": 5.648421506521117e-06,
+      "clip_ratio/low_mean": 4.28424866640853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.849090737479855e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14447.0,
+      "completions/mean_length": 5889.8359375,
+      "completions/mean_terminated_length": 5723.26220703125,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.7976400703191757,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030593445990234613,
+      "learning_rate": 1e-05,
+      "loss": 0.1331,
+      "num_tokens": 276910124.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999091029167175,
+      "sampling/importance_sampling_ratio/min": 0.000139843366923742,
+      "sampling/sampling_logp_difference/max": 8.874987602233887,
+      "sampling/sampling_logp_difference/mean": 0.01834402233362198,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 1.4654247024736833e-05,
+      "clip_ratio/high_mean": 3.663561756184208e-06,
+      "clip_ratio/low_mean": 2.377464920755301e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7438210736363544e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 7144.265625,
+      "completions/mean_terminated_length": 6689.85205078125,
+      "completions/min_length": 1200.0,
+      "completions/min_terminated_length": 1200.0,
+      "entropy": 0.8309404999017715,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004245694726705551,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 277843542.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998534321784973,
+      "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05,
+      "sampling/sampling_logp_difference/max": 11.499897956848145,
+      "sampling/sampling_logp_difference/mean": 0.01875344291329384,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 6.252500952541595e-06,
+      "clip_ratio/high_mean": 2.241558604509919e-06,
+      "clip_ratio/low_mean": 4.735765514851664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9599213525652885e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15722.0,
+      "completions/mean_length": 6779.5234375,
+      "completions/mean_terminated_length": 6703.8974609375,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.9584890529513359,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035574575886130333,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 278730129.0,
+      "reward": 0.3984375,
+      "reward_std": 0.32825323939323425,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.005792221520096064,
+      "sampling/sampling_logp_difference/max": 5.151239395141602,
+      "sampling/sampling_logp_difference/mean": 0.02137477695941925,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 3.2948471016425174e-05,
+      "clip_ratio/high_mean": 9.518853403278627e-06,
+      "clip_ratio/low_mean": 2.195712454522436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.14759782895635e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15892.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 5582.9765625,
+      "completions/mean_terminated_length": 5582.9765625,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8629376217722893,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037982752546668053,
+      "learning_rate": 1e-05,
+      "loss": 0.0331,
+      "num_tokens": 279462542.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3164186477661133,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999780058860779,
+      "sampling/importance_sampling_ratio/min": 0.0021874974481761456,
+      "sampling/sampling_logp_difference/max": 6.124997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01906203106045723,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.1029473625967512e-05,
+      "clip_ratio/high_mean": 2.757368406491878e-06,
+      "clip_ratio/low_mean": 5.367386921761863e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6431237737797346e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 6942.2578125,
+      "completions/mean_terminated_length": 6477.90966796875,
+      "completions/min_length": 1156.0,
+      "completions/min_terminated_length": 1156.0,
+      "entropy": 0.8147861957550049,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027678858023136854,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 280370207.0,
+      "reward": 0.4375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998471736907959,
+      "sampling/importance_sampling_ratio/min": 0.00023058800434228033,
+      "sampling/sampling_logp_difference/max": 8.3748779296875,
+      "sampling/sampling_logp_difference/mean": 0.01940828748047352,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 2.6367894406575942e-05,
+      "clip_ratio/high_mean": 8.765707434577052e-06,
+      "clip_ratio/low_mean": 3.232976985145797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.109547796815605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6242.53125,
+      "completions/mean_terminated_length": 5915.38671875,
+      "completions/min_length": 1220.0,
+      "completions/min_terminated_length": 1220.0,
+      "entropy": 0.878915011882782,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00577945914119482,
+      "learning_rate": 1e-05,
+      "loss": 0.0839,
+      "num_tokens": 281189491.0,
+      "reward": 0.515625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 9.611724817659706e-05,
+      "sampling/sampling_logp_difference/max": 9.2499418258667,
+      "sampling/sampling_logp_difference/mean": 0.01948760263621807,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 3.50839609382092e-05,
+      "clip_ratio/high_mean": 1.1664920634757436e-05,
+      "clip_ratio/low_mean": 1.833109013205103e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9996010880495305e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 7004.015625,
+      "completions/mean_terminated_length": 6622.71533203125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "entropy": 0.7964659407734871,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014128695474937558,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 282103997.0,
+      "reward": 0.4140625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.0024504722096025944,
+      "sampling/sampling_logp_difference/max": 6.011474609375,
+      "sampling/sampling_logp_difference/mean": 0.019019678235054016,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.832260545597819e-05,
+      "clip_ratio/high_mean": 4.580651363994548e-06,
+      "clip_ratio/low_mean": 5.309064226821647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.767129368905444e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7822.6953125,
+      "completions/mean_terminated_length": 7546.52392578125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.8571138679981232,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002476039342582226,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 283122382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999314546585083,
+      "sampling/importance_sampling_ratio/min": 0.0009774373611435294,
+      "sampling/sampling_logp_difference/max": 6.930576324462891,
+      "sampling/sampling_logp_difference/mean": 0.020557202398777008,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 5.738419986300869e-06,
+      "clip_ratio/high_mean": 1.4346049965752172e-06,
+      "clip_ratio/low_mean": 4.19679121819172e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3402517292179255e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7738.8984375,
+      "completions/mean_terminated_length": 6844.57763671875,
+      "completions/min_length": 897.0,
+      "completions/min_terminated_length": 897.0,
+      "entropy": 0.7839021533727646,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005309853237122297,
+      "learning_rate": 1e-05,
+      "loss": 0.043,
+      "num_tokens": 284130081.0,
+      "reward": 0.5234375,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998971223831177,
+      "sampling/importance_sampling_ratio/min": 0.0001319014554610476,
+      "sampling/sampling_logp_difference/max": 8.933455467224121,
+      "sampling/sampling_logp_difference/mean": 0.01873316988348961,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 1.007085802484653e-05,
+      "clip_ratio/high_mean": 2.5177145062116324e-06,
+      "clip_ratio/low_mean": 4.043528815600439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.295300277590286e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15952.0,
+      "completions/mean_length": 7102.2421875,
+      "completions/mean_terminated_length": 6954.9130859375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.8530801385641098,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004228116944432259,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 285058720.0,
+      "reward": 0.5078125,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00012956927821505815,
+      "sampling/sampling_logp_difference/max": 8.951294898986816,
+      "sampling/sampling_logp_difference/mean": 0.019325006753206253,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 4.06874551117653e-06,
+      "clip_ratio/high_mean": 1.0171863777941326e-06,
+      "clip_ratio/low_mean": 3.661125703047219e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.762844340826632e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15594.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6583.4765625,
+      "completions/mean_terminated_length": 6583.4765625,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 1.021921381354332,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004967439454048872,
+      "learning_rate": 1e-05,
+      "loss": 0.0374,
+      "num_tokens": 285919765.0,
+      "reward": 0.328125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00004243850708,
+      "sampling/importance_sampling_ratio/min": 0.016675354912877083,
+      "sampling/sampling_logp_difference/max": 4.093823432922363,
+      "sampling/sampling_logp_difference/mean": 0.021393200382590294,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.2215251445013564e-05,
+      "clip_ratio/high_mean": 3.053812861253391e-06,
+      "clip_ratio/low_mean": 4.05305947879242e-05,
+      "clip_ratio/low_min": 4.215567059873138e-06,
+      "clip_ratio/region_mean": 4.358440742180392e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16299.0,
+      "completions/mean_length": 7770.5859375,
+      "completions/mean_terminated_length": 7346.97509765625,
+      "completions/min_length": 1040.0,
+      "completions/min_terminated_length": 1040.0,
+      "entropy": 1.0466903448104858,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004189736675471067,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 286935512.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2369818240404129,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797344207764,
+      "sampling/importance_sampling_ratio/min": 0.011683559976518154,
+      "sampling/sampling_logp_difference/max": 4.449572563171387,
+      "sampling/sampling_logp_difference/mean": 0.021805983036756516,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 2.0567378214764176e-05,
+      "clip_ratio/high_mean": 5.141844553691044e-06,
+      "clip_ratio/low_mean": 1.8177100628236076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3318944840866607e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15758.0,
+      "completions/mean_length": 5689.2421875,
+      "completions/mean_terminated_length": 5432.568359375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.7778806164860725,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0032866497058421373,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 287681943.0,
+      "reward": 0.640625,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940812587738,
+      "sampling/importance_sampling_ratio/min": 0.00038077132194302976,
+      "sampling/sampling_logp_difference/max": 7.873311519622803,
+      "sampling/sampling_logp_difference/mean": 0.01789461076259613,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 3.109086901531555e-05,
+      "clip_ratio/high_mean": 7.772717253828887e-06,
+      "clip_ratio/low_mean": 3.1423560130861006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.919627738468989e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13820.0,
+      "completions/mean_length": 6288.1875,
+      "completions/mean_terminated_length": 6127.93701171875,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.7709921672940254,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023572889622300863,
+      "learning_rate": 1e-05,
+      "loss": 0.0746,
+      "num_tokens": 288506735.0,
+      "reward": 0.484375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474287033081,
+      "sampling/importance_sampling_ratio/min": 0.000430915504693985,
+      "sampling/sampling_logp_difference/max": 7.749598503112793,
+      "sampling/sampling_logp_difference/mean": 0.017407266423106194,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 3.4638953366084024e-05,
+      "clip_ratio/high_mean": 9.51674803673086e-06,
+      "clip_ratio/low_mean": 6.26047980176736e-05,
+      "clip_ratio/low_min": 5.51267930859467e-06,
+      "clip_ratio/region_mean": 7.212154741864651e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 6775.0234375,
+      "completions/mean_terminated_length": 6465.05615234375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9338318258523941,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034220058005303144,
+      "learning_rate": 1e-05,
+      "loss": 0.0986,
+      "num_tokens": 289395498.0,
+      "reward": 0.390625,
+      "reward_std": 0.34533774852752686,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603033065796,
+      "sampling/importance_sampling_ratio/min": 0.0317598432302475,
+      "sampling/sampling_logp_difference/max": 3.449552536010742,
+      "sampling/sampling_logp_difference/mean": 0.019930530339479446,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 7.159989991123439e-05,
+      "clip_ratio/low_min": 1.5592839645250933e-05,
+      "clip_ratio/region_mean": 7.159989991123439e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 7142.9375,
+      "completions/mean_terminated_length": 6844.83837890625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 0.971405878663063,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002513247774913907,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 290329082.0,
+      "reward": 0.328125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999737739562988,
+      "sampling/importance_sampling_ratio/min": 3.152207455059397e-07,
+      "sampling/sampling_logp_difference/max": 14.969992637634277,
+      "sampling/sampling_logp_difference/mean": 0.022366533055901527,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 1.6507752206962323e-05,
+      "clip_ratio/high_mean": 4.126938051740581e-06,
+      "clip_ratio/low_mean": 1.7493430505055585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1620368215735652e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15581.0,
+      "completions/mean_length": 6412.2109375,
+      "completions/mean_terminated_length": 6333.69287109375,
+      "completions/min_length": 544.0,
+      "completions/min_terminated_length": 544.0,
+      "entropy": 0.9136044681072235,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0056767817586660385,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 291170133.0,
+      "reward": 0.421875,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999720454216003,
+      "sampling/importance_sampling_ratio/min": 0.000458698661532253,
+      "sampling/sampling_logp_difference/max": 7.687117099761963,
+      "sampling/sampling_logp_difference/mean": 0.020012658089399338,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 8.26085442895419e-06,
+      "clip_ratio/high_mean": 2.0652136072385474e-06,
+      "clip_ratio/low_mean": 3.6938338666914206e-05,
+      "clip_ratio/low_min": 5.699044777429663e-06,
+      "clip_ratio/region_mean": 3.900355193309224e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16111.0,
+      "completions/mean_length": 8066.1015625,
+      "completions/mean_terminated_length": 7797.7822265625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 1.0789504647254944,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00243841833434999,
+      "learning_rate": 1e-05,
+      "loss": 0.0432,
+      "num_tokens": 292222082.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999664425849915,
+      "sampling/importance_sampling_ratio/min": 8.481895929435268e-05,
+      "sampling/sampling_logp_difference/max": 9.374991416931152,
+      "sampling/sampling_logp_difference/mean": 0.023650091141462326,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 5.320054697222076e-06,
+      "clip_ratio/high_mean": 1.330013674305519e-06,
+      "clip_ratio/low_mean": 1.9117383317279746e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0447396991585265e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15176.0,
+      "completions/mean_length": 6836.046875,
+      "completions/mean_terminated_length": 6606.896484375,
+      "completions/min_length": 785.0,
+      "completions/min_terminated_length": 785.0,
+      "entropy": 1.218759760260582,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0020856577903032303,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 293115984.0,
+      "reward": 0.21875,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999911785125732,
+      "sampling/importance_sampling_ratio/min": 2.784526441246271e-05,
+      "sampling/sampling_logp_difference/max": 10.488847732543945,
+      "sampling/sampling_logp_difference/mean": 0.022012067958712578,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 2.5695502699818462e-05,
+      "clip_ratio/high_mean": 7.549717793153832e-06,
+      "clip_ratio/low_mean": 4.6741323160404136e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.429104089671455e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15796.0,
+      "completions/mean_length": 7501.9921875,
+      "completions/mean_terminated_length": 7140.9345703125,
+      "completions/min_length": 1237.0,
+      "completions/min_terminated_length": 1237.0,
+      "entropy": 0.8940394818782806,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005163854919373989,
+      "learning_rate": 1e-05,
+      "loss": 0.0354,
+      "num_tokens": 294099503.0,
+      "reward": 0.328125,
+      "reward_std": 0.30904707312583923,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999276399612427,
+      "sampling/importance_sampling_ratio/min": 0.0006545600481331348,
+      "sampling/sampling_logp_difference/max": 7.331547260284424,
+      "sampling/sampling_logp_difference/mean": 0.020813245326280594,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 3.1606674838258186e-05,
+      "clip_ratio/high_mean": 9.45794374729303e-06,
+      "clip_ratio/low_mean": 4.5567895540443715e-05,
+      "clip_ratio/low_min": 4.458871444512624e-06,
+      "clip_ratio/region_mean": 5.502583962879726e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7204.828125,
+      "completions/mean_terminated_length": 6908.7255859375,
+      "completions/min_length": 846.0,
+      "completions/min_terminated_length": 846.0,
+      "entropy": 0.9961872175335884,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029277894645929337,
+      "learning_rate": 1e-05,
+      "loss": 0.0963,
+      "num_tokens": 295042105.0,
+      "reward": 0.390625,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000677108764648,
+      "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05,
+      "sampling/sampling_logp_difference/max": 10.872637748718262,
+      "sampling/sampling_logp_difference/mean": 0.020187582820653915,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 1.7963964182854397e-05,
+      "clip_ratio/high_mean": 5.194059781388205e-06,
+      "clip_ratio/low_mean": 1.8380221035840805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.357428081722901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15856.0,
+      "completions/mean_length": 6256.859375,
+      "completions/mean_terminated_length": 6013.80810546875,
+      "completions/min_length": 1006.0,
+      "completions/min_terminated_length": 1006.0,
+      "entropy": 0.9293600022792816,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032952844630926847,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 295867039.0,
+      "reward": 0.46875,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999649524688721,
+      "sampling/importance_sampling_ratio/min": 7.995560008566827e-05,
+      "sampling/sampling_logp_difference/max": 9.434039115905762,
+      "sampling/sampling_logp_difference/mean": 0.019491540268063545,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 7.577551059512189e-06,
+      "clip_ratio/high_mean": 1.8943877648780472e-06,
+      "clip_ratio/low_mean": 2.7479814093567256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9374201631071628e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15412.0,
+      "completions/mean_length": 7397.84375,
+      "completions/mean_terminated_length": 7032.552734375,
+      "completions/min_length": 923.0,
+      "completions/min_terminated_length": 923.0,
+      "entropy": 0.8508890569210052,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029417150653898716,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 296832843.0,
+      "reward": 0.375,
+      "reward_std": 0.2867125868797302,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000183582305908,
+      "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05,
+      "sampling/sampling_logp_difference/max": 10.93724250793457,
+      "sampling/sampling_logp_difference/mean": 0.01975393109023571,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 3.281225508544594e-05,
+      "clip_ratio/high_mean": 1.3302957199812226e-05,
+      "clip_ratio/low_mean": 5.109179869577929e-05,
+      "clip_ratio/low_min": 6.657612175331451e-06,
+      "clip_ratio/region_mean": 6.439475532715733e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14983.0,
+      "completions/mean_length": 6897.765625,
+      "completions/mean_terminated_length": 6823.07080078125,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.9046694040298462,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026788609102368355,
+      "learning_rate": 1e-05,
+      "loss": 0.0664,
+      "num_tokens": 297735285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3266732692718506,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999909520149231,
+      "sampling/importance_sampling_ratio/min": 0.001710799871943891,
+      "sampling/sampling_logp_difference/max": 6.370794296264648,
+      "sampling/sampling_logp_difference/mean": 0.020578179508447647,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 1.7319889593636617e-05,
+      "clip_ratio/high_mean": 5.168538336874917e-06,
+      "clip_ratio/low_mean": 7.019768918326008e-05,
+      "clip_ratio/low_min": 2.541147478041239e-05,
+      "clip_ratio/region_mean": 7.53662266106403e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15525.0,
+      "completions/mean_length": 6971.9921875,
+      "completions/mean_terminated_length": 6509.10595703125,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "entropy": 0.8658201694488525,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005915141198784113,
+      "learning_rate": 1e-05,
+      "loss": 0.0923,
+      "num_tokens": 298645124.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3742823898792267,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999268651008606,
+      "sampling/importance_sampling_ratio/min": 0.000970841443631798,
+      "sampling/sampling_logp_difference/max": 6.937347412109375,
+      "sampling/sampling_logp_difference/mean": 0.01906151883304119,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.8332865238335216e-05,
+      "clip_ratio/high_mean": 4.583216309583804e-06,
+      "clip_ratio/low_mean": 6.167940273371642e-05,
+      "clip_ratio/low_min": 5.969151516183047e-06,
+      "clip_ratio/region_mean": 6.626261847486603e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15054.0,
+      "completions/mean_length": 6545.6953125,
+      "completions/mean_terminated_length": 5889.80859375,
+      "completions/min_length": 800.0,
+      "completions/min_terminated_length": 800.0,
+      "entropy": 0.779609851539135,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0032792428974062204,
+      "learning_rate": 1e-05,
+      "loss": 0.097,
+      "num_tokens": 299503781.0,
+      "reward": 0.609375,
+      "reward_std": 0.38293448090553284,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999361634254456,
+      "sampling/importance_sampling_ratio/min": 0.002187495119869709,
+      "sampling/sampling_logp_difference/max": 6.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.017413027584552765,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.46246323235755e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.46246323235755e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7226.515625,
+      "completions/mean_terminated_length": 7006.736328125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9573849961161613,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005092279519885778,
+      "learning_rate": 1e-05,
+      "loss": 0.1102,
+      "num_tokens": 300447903.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999373555183411,
+      "sampling/importance_sampling_ratio/min": 0.000627054600045085,
+      "sampling/sampling_logp_difference/max": 7.374476909637451,
+      "sampling/sampling_logp_difference/mean": 0.021570835262537003,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 5.487269390869187e-06,
+      "clip_ratio/high_mean": 1.3718173477172968e-06,
+      "clip_ratio/low_mean": 4.7280102080549113e-05,
+      "clip_ratio/low_min": 1.0166083029616857e-05,
+      "clip_ratio/region_mean": 4.865191931457957e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14967.0,
+      "completions/mean_length": 5755.171875,
+      "completions/mean_terminated_length": 5323.10546875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8482184633612633,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005033228080719709,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 301206021.0,
+      "reward": 0.390625,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999947547912598,
+      "sampling/importance_sampling_ratio/min": 0.0014573346124961972,
+      "sampling/sampling_logp_difference/max": 6.531146049499512,
+      "sampling/sampling_logp_difference/mean": 0.018870476633310318,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 5.421346941147931e-06,
+      "clip_ratio/high_mean": 1.3553367352869827e-06,
+      "clip_ratio/low_mean": 1.6510994441887306e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.786633117717429e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 7098.7265625,
+      "completions/mean_terminated_length": 6875.88037109375,
+      "completions/min_length": 947.0,
+      "completions/min_terminated_length": 947.0,
+      "entropy": 0.87320177257061,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.007659573573619127,
+      "learning_rate": 1e-05,
+      "loss": 0.0707,
+      "num_tokens": 302133890.0,
+      "reward": 0.421875,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0012466582702472806,
+      "sampling/sampling_logp_difference/max": 6.687288761138916,
+      "sampling/sampling_logp_difference/mean": 0.019994346424937248,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 1.1556229310372146e-05,
+      "clip_ratio/high_mean": 2.8890573275930365e-06,
+      "clip_ratio/low_mean": 3.8744643916288624e-05,
+      "clip_ratio/low_min": 6.108287834649673e-06,
+      "clip_ratio/region_mean": 4.1633702039689524e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16139.0,
+      "completions/mean_length": 6399.96875,
+      "completions/mean_terminated_length": 6077.90283203125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9481896534562111,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014135175151750445,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 302972566.0,
+      "reward": 0.4140625,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0025698256213217974,
+      "sampling/sampling_logp_difference/max": 5.963917255401611,
+      "sampling/sampling_logp_difference/mean": 0.02073008380830288,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 6.59491388432798e-06,
+      "clip_ratio/high_mean": 2.545892130001448e-06,
+      "clip_ratio/low_mean": 4.620846755187813e-05,
+      "clip_ratio/low_min": 6.243132702365983e-06,
+      "clip_ratio/region_mean": 4.875435956819274e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 7298.078125,
+      "completions/mean_terminated_length": 7226.53564453125,
+      "completions/min_length": 1009.0,
+      "completions/min_terminated_length": 1009.0,
+      "entropy": 0.8719206526875496,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027898226398974657,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 303925976.0,
+      "reward": 0.484375,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.005236432887613773,
+      "sampling/sampling_logp_difference/max": 5.252114772796631,
+      "sampling/sampling_logp_difference/mean": 0.020944103598594666,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 1.052124343914329e-05,
+      "clip_ratio/high_mean": 2.6303108597858227e-06,
+      "clip_ratio/low_mean": 2.010384196182713e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.273415248055244e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14980.0,
+      "completions/mean_length": 5667.0390625,
+      "completions/mean_terminated_length": 5496.9287109375,
+      "completions/min_length": 974.0,
+      "completions/min_terminated_length": 974.0,
+      "entropy": 0.8791451379656792,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012764945859089494,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 304675157.0,
+      "reward": 0.390625,
+      "reward_std": 0.17965976893901825,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000383853912354,
+      "sampling/importance_sampling_ratio/min": 5.054428584116977e-06,
+      "sampling/sampling_logp_difference/max": 12.195245742797852,
+      "sampling/sampling_logp_difference/mean": 0.018928447738289833,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 9.578045592206763e-06,
+      "clip_ratio/high_mean": 2.3945113980516908e-06,
+      "clip_ratio/low_mean": 3.1114799753595435e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350931149270764e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15354.0,
+      "completions/max_terminated_length": 15354.0,
+      "completions/mean_length": 5874.4453125,
+      "completions/mean_terminated_length": 5874.4453125,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9577538818120956,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00509974779561162,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 305447038.0,
+      "reward": 0.515625,
+      "reward_std": 0.24777325987815857,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999423027038574,
+      "sampling/importance_sampling_ratio/min": 0.004791648127138615,
+      "sampling/sampling_logp_difference/max": 5.340880870819092,
+      "sampling/sampling_logp_difference/mean": 0.02114470861852169,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 1.0903062275247066e-05,
+      "clip_ratio/high_mean": 2.7257655688117666e-06,
+      "clip_ratio/low_mean": 4.784364205079328e-05,
+      "clip_ratio/low_min": 3.861600362142781e-06,
+      "clip_ratio/region_mean": 5.056940744907479e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 6197.5703125,
+      "completions/mean_terminated_length": 6035.88134765625,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.8665244281291962,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030849494505673647,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 306258023.0,
+      "reward": 0.515625,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998056888580322,
+      "sampling/importance_sampling_ratio/min": 0.000830297009088099,
+      "sampling/sampling_logp_difference/max": 7.093727111816406,
+      "sampling/sampling_logp_difference/mean": 0.021017421036958694,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 1.4299712574938894e-05,
+      "clip_ratio/high_mean": 4.3520980170796975e-06,
+      "clip_ratio/low_mean": 6.213493452378316e-05,
+      "clip_ratio/low_min": 1.0056635801447555e-05,
+      "clip_ratio/region_mean": 6.648703174505499e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 7522.578125,
+      "completions/mean_terminated_length": 7381.9208984375,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.8185881152749062,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002946985885500908,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 307240305.0,
+      "reward": 0.3125,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.005127199459820986,
+      "sampling/sampling_logp_difference/max": 5.273195743560791,
+      "sampling/sampling_logp_difference/mean": 0.01965932548046112,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.693051035545068e-05,
+      "clip_ratio/high_mean": 5.08456730585749e-06,
+      "clip_ratio/low_mean": 4.2052345861520735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.713691282631771e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14090.0,
+      "completions/mean_length": 6403.2265625,
+      "completions/mean_terminated_length": 6163.6884765625,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "entropy": 0.8359840363264084,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031181599479168653,
+      "learning_rate": 1e-05,
+      "loss": 0.072,
+      "num_tokens": 308079318.0,
+      "reward": 0.5,
+      "reward_std": 0.27145031094551086,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999215602874756,
+      "sampling/importance_sampling_ratio/min": 6.73715621815063e-05,
+      "sampling/sampling_logp_difference/max": 9.605287551879883,
+      "sampling/sampling_logp_difference/mean": 0.01963040418922901,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 1.3988919135954347e-05,
+      "clip_ratio/high_mean": 3.497229783988587e-06,
+      "clip_ratio/low_mean": 6.722658486069122e-05,
+      "clip_ratio/low_min": 1.858519090092159e-05,
+      "clip_ratio/region_mean": 7.072381458783639e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16148.0,
+      "completions/mean_length": 7954.03125,
+      "completions/mean_terminated_length": 7751.71240234375,
+      "completions/min_length": 632.0,
+      "completions/min_terminated_length": 632.0,
+      "entropy": 0.905990719795227,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002656223252415657,
+      "learning_rate": 1e-05,
+      "loss": 0.1022,
+      "num_tokens": 309117770.0,
+      "reward": 0.3828125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999536275863647,
+      "sampling/importance_sampling_ratio/min": 0.0003354826185386628,
+      "sampling/sampling_logp_difference/max": 7.999940395355225,
+      "sampling/sampling_logp_difference/mean": 0.020741507411003113,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.7610595023143105e-05,
+      "clip_ratio/high_mean": 4.402648755785776e-06,
+      "clip_ratio/low_mean": 4.337988764291367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.778253651238629e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6630.09375,
+      "completions/mean_terminated_length": 6315.45166015625,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.870736837387085,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0060529084876179695,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 309988894.0,
+      "reward": 0.515625,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998822212219238,
+      "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05,
+      "sampling/sampling_logp_difference/max": 10.716434478759766,
+      "sampling/sampling_logp_difference/mean": 0.02060208097100258,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 1.0448093235027045e-05,
+      "clip_ratio/high_mean": 2.6120233087567613e-06,
+      "clip_ratio/low_mean": 3.1030769946482906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.364279325523967e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15920.0,
+      "completions/max_terminated_length": 15920.0,
+      "completions/mean_length": 6679.6171875,
+      "completions/mean_terminated_length": 6679.6171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9812518879771233,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00400698184967041,
+      "learning_rate": 1e-05,
+      "loss": 0.0605,
+      "num_tokens": 310864013.0,
+      "reward": 0.421875,
+      "reward_std": 0.3295465111732483,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999049305915833,
+      "sampling/importance_sampling_ratio/min": 0.0020593837834894657,
+      "sampling/sampling_logp_difference/max": 6.1853485107421875,
+      "sampling/sampling_logp_difference/mean": 0.02098071575164795,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 2.124982574969181e-05,
+      "clip_ratio/high_mean": 7.736592579021817e-06,
+      "clip_ratio/low_mean": 2.900951585615985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.674610888992902e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14541.0,
+      "completions/mean_length": 5523.796875,
+      "completions/mean_terminated_length": 5173.4677734375,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9120645374059677,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005929585546255112,
+      "learning_rate": 1e-05,
+      "loss": 0.0362,
+      "num_tokens": 311589987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998446702957153,
+      "sampling/importance_sampling_ratio/min": 0.0010661041596904397,
+      "sampling/sampling_logp_difference/max": 6.843744277954102,
+      "sampling/sampling_logp_difference/mean": 0.019948206841945648,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 2.4486997745043482e-05,
+      "clip_ratio/high_mean": 8.219769085826556e-06,
+      "clip_ratio/low_mean": 5.346400575945154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.168377467474784e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15401.0,
+      "completions/mean_length": 6361.3671875,
+      "completions/mean_terminated_length": 6282.44873046875,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.8044678047299385,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006622390355914831,
+      "learning_rate": 1e-05,
+      "loss": 0.1023,
+      "num_tokens": 312424034.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3724474310874939,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000219345092773,
+      "sampling/importance_sampling_ratio/min": 0.0003157092141918838,
+      "sampling/sampling_logp_difference/max": 8.060688972473145,
+      "sampling/sampling_logp_difference/mean": 0.018907658755779266,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 1.0407376748844399e-05,
+      "clip_ratio/high_mean": 2.6018441872110998e-06,
+      "clip_ratio/low_mean": 5.925514369664597e-05,
+      "clip_ratio/low_min": 1.3324347946763737e-05,
+      "clip_ratio/region_mean": 6.185698703120579e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15883.0,
+      "completions/mean_length": 7109.0,
+      "completions/mean_terminated_length": 7035.96826171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9167275875806808,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004639944992959499,
+      "learning_rate": 1e-05,
+      "loss": 0.0861,
+      "num_tokens": 313353346.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3826971650123596,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999389052391052,
+      "sampling/importance_sampling_ratio/min": 0.0019070414127781987,
+      "sampling/sampling_logp_difference/max": 6.262202262878418,
+      "sampling/sampling_logp_difference/mean": 0.02155841514468193,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 3.959046694035351e-05,
+      "clip_ratio/high_mean": 1.0912523691786191e-05,
+      "clip_ratio/low_mean": 3.3944450819944905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.485697365907981e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6314.2734375,
+      "completions/mean_terminated_length": 6072.60009765625,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.8780038207769394,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.007643720600754023,
+      "learning_rate": 1e-05,
+      "loss": 0.0873,
+      "num_tokens": 314180717.0,
+      "reward": 0.4609375,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999802112579346,
+      "sampling/importance_sampling_ratio/min": 0.021285315975546837,
+      "sampling/sampling_logp_difference/max": 3.8497378826141357,
+      "sampling/sampling_logp_difference/mean": 0.01964358240365982,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 3.065382111344661e-05,
+      "clip_ratio/high_mean": 9.187473835936544e-06,
+      "clip_ratio/low_mean": 4.137891801292426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.056639065514901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6718.2265625,
+      "completions/mean_terminated_length": 6486.24853515625,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.8326799497008324,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050973957404494286,
+      "learning_rate": 1e-05,
+      "loss": 0.0109,
+      "num_tokens": 315060842.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3521803915500641,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000014066696167,
+      "sampling/importance_sampling_ratio/min": 0.0009130688849836588,
+      "sampling/sampling_logp_difference/max": 6.998699188232422,
+      "sampling/sampling_logp_difference/mean": 0.019501537084579468,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 8.624853762739804e-06,
+      "clip_ratio/high_mean": 2.156213440684951e-06,
+      "clip_ratio/low_mean": 1.8797969062234188e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0954182048171788e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16128.0,
+      "completions/mean_length": 8666.8359375,
+      "completions/mean_terminated_length": 7941.291015625,
+      "completions/min_length": 565.0,
+      "completions/min_terminated_length": 565.0,
+      "entropy": 0.9526705741882324,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019092690199613571,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 316190325.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999814629554749,
+      "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05,
+      "sampling/sampling_logp_difference/max": 10.249995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02051631174981594,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 2.147400391550036e-05,
+      "clip_ratio/high_mean": 6.434908300434472e-06,
+      "clip_ratio/low_mean": 3.521234066283796e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.164724816746457e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15164.0,
+      "completions/mean_length": 7661.8203125,
+      "completions/mean_terminated_length": 7002.16015625,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 0.8322782590985298,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019530428107827902,
+      "learning_rate": 1e-05,
+      "loss": 0.0729,
+      "num_tokens": 317191878.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21382391452789307,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 8.546619210392237e-05,
+      "sampling/sampling_logp_difference/max": 9.367389678955078,
+      "sampling/sampling_logp_difference/mean": 0.019894573837518692,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.9436202364886412e-05,
+      "clip_ratio/high_mean": 6.089704697842535e-06,
+      "clip_ratio/low_mean": 4.2698405422925134e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.878810955233348e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15934.0,
+      "completions/mean_length": 7024.859375,
+      "completions/mean_terminated_length": 6800.240234375,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.794853538274765,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031784537713974714,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 318109004.0,
+      "reward": 0.4921875,
+      "reward_std": 0.31800347566604614,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999352693557739,
+      "sampling/importance_sampling_ratio/min": 0.0002962362195830792,
+      "sampling/sampling_logp_difference/max": 8.124353408813477,
+      "sampling/sampling_logp_difference/mean": 0.018519200384616852,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 4.127455667912727e-06,
+      "clip_ratio/high_mean": 1.0318639169781818e-06,
+      "clip_ratio/low_mean": 4.342453667049995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.445640047379129e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 7282.1796875,
+      "completions/mean_terminated_length": 6912.1865234375,
+      "completions/min_length": 870.0,
+      "completions/min_terminated_length": 870.0,
+      "entropy": 0.904067650437355,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005080109462141991,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 319059075.0,
+      "reward": 0.4140625,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000062108039856,
+      "sampling/importance_sampling_ratio/min": 0.1194523349404335,
+      "sampling/sampling_logp_difference/max": 6.136754989624023,
+      "sampling/sampling_logp_difference/mean": 0.019978653639554977,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.608940076243016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.608940076243016e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15625.0,
+      "completions/mean_length": 7131.5234375,
+      "completions/mean_terminated_length": 6596.255859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.8849587142467499,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022667953744530678,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 319990046.0,
+      "reward": 0.46875,
+      "reward_std": 0.30221715569496155,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0370909757912159,
+      "sampling/sampling_logp_difference/max": 3.294381618499756,
+      "sampling/sampling_logp_difference/mean": 0.02037571743130684,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 1.5356635913121863e-05,
+      "clip_ratio/high_mean": 3.839158978280466e-06,
+      "clip_ratio/low_mean": 3.4950805911648786e-05,
+      "clip_ratio/low_min": 4.876336333836662e-06,
+      "clip_ratio/region_mean": 3.8789965287833184e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16205.0,
+      "completions/mean_length": 6655.4453125,
+      "completions/mean_terminated_length": 6578.84228515625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.7417122721672058,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00216497085057199,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 320860135.0,
+      "reward": 0.5625,
+      "reward_std": 0.3369230031967163,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 0.0005190494703128934,
+      "sampling/sampling_logp_difference/max": 7.563511371612549,
+      "sampling/sampling_logp_difference/mean": 0.01771342009305954,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 1.7605634639039636e-05,
+      "clip_ratio/high_mean": 5.297029474604642e-06,
+      "clip_ratio/low_mean": 5.688933060810086e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.218636053745286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15849.0,
+      "completions/mean_length": 7077.1640625,
+      "completions/mean_terminated_length": 6619.45068359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 0.8749325424432755,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0028338562697172165,
+      "learning_rate": 1e-05,
+      "loss": 0.0643,
+      "num_tokens": 321783852.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998220205307007,
+      "sampling/importance_sampling_ratio/min": 7.83290306571871e-06,
+      "sampling/sampling_logp_difference/max": 11.757177352905273,
+      "sampling/sampling_logp_difference/mean": 0.020299233496189117,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 7.301828190975357e-06,
+      "clip_ratio/high_mean": 1.8254570477438392e-06,
+      "clip_ratio/low_mean": 5.158197632226802e-05,
+      "clip_ratio/low_min": 3.735804057214409e-06,
+      "clip_ratio/region_mean": 5.340743223314348e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15329.0,
+      "completions/mean_length": 6034.296875,
+      "completions/mean_terminated_length": 5525.294921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.80014718323946,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022897711023688316,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 322572882.0,
+      "reward": 0.40625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999347925186157,
+      "sampling/importance_sampling_ratio/min": 0.0004105660773348063,
+      "sampling/sampling_logp_difference/max": 7.7979736328125,
+      "sampling/sampling_logp_difference/mean": 0.01858348958194256,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 9.364057859784225e-06,
+      "clip_ratio/high_mean": 3.351393047523743e-06,
+      "clip_ratio/low_mean": 4.186752630630508e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5218919240141986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 8172.109375,
+      "completions/mean_terminated_length": 7838.29248046875,
+      "completions/min_length": 733.0,
+      "completions/min_terminated_length": 733.0,
+      "entropy": 0.8732693120837212,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003263789461925626,
+      "learning_rate": 1e-05,
+      "loss": 0.0356,
+      "num_tokens": 323640904.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3237774670124054,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999354481697083,
+      "sampling/importance_sampling_ratio/min": 9.27252222027164e-06,
+      "sampling/sampling_logp_difference/max": 11.588455200195312,
+      "sampling/sampling_logp_difference/mean": 0.0208889190107584,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 2.0998899799451465e-05,
+      "clip_ratio/high_mean": 6.692962131182867e-06,
+      "clip_ratio/low_mean": 4.261424010110204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.930720297124935e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 7699.203125,
+      "completions/mean_terminated_length": 7419.04833984375,
+      "completions/min_length": 1225.0,
+      "completions/min_terminated_length": 1225.0,
+      "entropy": 0.8296505436301231,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0042716520838439465,
+      "learning_rate": 1e-05,
+      "loss": 0.0937,
+      "num_tokens": 324643858.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874234199524,
+      "sampling/importance_sampling_ratio/min": 0.00022192654432728887,
+      "sampling/sampling_logp_difference/max": 8.413164138793945,
+      "sampling/sampling_logp_difference/mean": 0.018926654011011124,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 7.061349151626928e-06,
+      "clip_ratio/high_mean": 1.765337287906732e-06,
+      "clip_ratio/low_mean": 4.5005243464402156e-05,
+      "clip_ratio/low_min": 3.861838649754645e-06,
+      "clip_ratio/region_mean": 4.6770580411248375e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16364.0,
+      "completions/max_terminated_length": 16364.0,
+      "completions/mean_length": 7450.1640625,
+      "completions/mean_terminated_length": 7450.1640625,
+      "completions/min_length": 910.0,
+      "completions/min_terminated_length": 910.0,
+      "entropy": 1.0400195196270943,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033558050636202097,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 325617687.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999459385871887,
+      "sampling/importance_sampling_ratio/min": 0.039920732378959656,
+      "sampling/sampling_logp_difference/max": 3.2208595275878906,
+      "sampling/sampling_logp_difference/mean": 0.02249298244714737,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 1.3147802746971138e-05,
+      "clip_ratio/high_mean": 3.2869506867427845e-06,
+      "clip_ratio/low_mean": 2.4451034505545977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7737984851228248e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15342.0,
+      "completions/mean_length": 6799.0703125,
+      "completions/mean_terminated_length": 6723.5986328125,
+      "completions/min_length": 1708.0,
+      "completions/min_terminated_length": 1708.0,
+      "entropy": 0.9737623482942581,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005797459278255701,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 326508384.0,
+      "reward": 0.3125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999321699142456,
+      "sampling/importance_sampling_ratio/min": 7.535634836131067e-07,
+      "sampling/sampling_logp_difference/max": 14.0984525680542,
+      "sampling/sampling_logp_difference/mean": 0.021543748676776886,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 3.3594023989280686e-06,
+      "clip_ratio/high_mean": 8.398505997320171e-07,
+      "clip_ratio/low_mean": 2.3457610382138228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4297460981870245e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 7034.3671875,
+      "completions/mean_terminated_length": 6654.30078125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8749603256583214,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002258980879560113,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 327426407.0,
+      "reward": 0.4609375,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999661445617676,
+      "sampling/importance_sampling_ratio/min": 0.008719252422451973,
+      "sampling/sampling_logp_difference/max": 4.742221832275391,
+      "sampling/sampling_logp_difference/mean": 0.01997346058487892,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 2.823375348270929e-05,
+      "clip_ratio/high_mean": 7.058438370677322e-06,
+      "clip_ratio/low_mean": 4.9395109726901865e-05,
+      "clip_ratio/low_min": 1.636556044104509e-05,
+      "clip_ratio/region_mean": 5.6453548268109444e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15240.0,
+      "completions/mean_length": 6623.078125,
+      "completions/mean_terminated_length": 6388.81640625,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.858784057199955,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002420129720121622,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 328292985.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537417411804,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998596906661987,
+      "sampling/importance_sampling_ratio/min": 0.00014900295354891568,
+      "sampling/sampling_logp_difference/max": 8.811544418334961,
+      "sampling/sampling_logp_difference/mean": 0.019645996391773224,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 1.8078507309837732e-05,
+      "clip_ratio/high_mean": 6.468551191574079e-06,
+      "clip_ratio/low_mean": 4.051302585139638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.698157727034413e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15229.0,
+      "completions/mean_length": 5902.4765625,
+      "completions/mean_terminated_length": 5564.36279296875,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.904740035533905,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004107976797968149,
+      "learning_rate": 1e-05,
+      "loss": 0.0824,
+      "num_tokens": 329067006.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3945493996143341,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999526143074036,
+      "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05,
+      "sampling/sampling_logp_difference/max": 11.37439250946045,
+      "sampling/sampling_logp_difference/mean": 0.019582755863666534,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 2.553658168835682e-05,
+      "clip_ratio/high_mean": 7.276365181496658e-06,
+      "clip_ratio/low_mean": 1.7552573126522475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.482893796695862e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6425.6015625,
+      "completions/mean_terminated_length": 6267.5322265625,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.964553713798523,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003208522219210863,
+      "learning_rate": 1e-05,
+      "loss": 0.0164,
+      "num_tokens": 329910691.0,
+      "reward": 0.359375,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999419450759888,
+      "sampling/importance_sampling_ratio/min": 0.00137569778598845,
+      "sampling/sampling_logp_difference/max": 6.588794231414795,
+      "sampling/sampling_logp_difference/mean": 0.021154657006263733,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 6.8712420215888415e-06,
+      "clip_ratio/high_mean": 1.7178105053972104e-06,
+      "clip_ratio/low_mean": 4.0991827404468495e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2709637853022286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 8006.4453125,
+      "completions/mean_terminated_length": 7594.43408203125,
+      "completions/min_length": 1235.0,
+      "completions/min_terminated_length": 1235.0,
+      "entropy": 0.8980336412787437,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002898421371355653,
+      "learning_rate": 1e-05,
+      "loss": 0.0815,
+      "num_tokens": 330956332.0,
+      "reward": 0.4296875,
+      "reward_std": 0.20175684988498688,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 9.378339746035635e-05,
+      "sampling/sampling_logp_difference/max": 9.27452278137207,
+      "sampling/sampling_logp_difference/mean": 0.021021340042352676,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.2689344689297286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2689344689297286e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15484.0,
+      "completions/max_terminated_length": 15484.0,
+      "completions/mean_length": 7068.828125,
+      "completions/mean_terminated_length": 7068.828125,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.9865007549524307,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0037063576746731997,
+      "learning_rate": 1e-05,
+      "loss": 0.0313,
+      "num_tokens": 331880918.0,
+      "reward": 0.3203125,
+      "reward_std": 0.17859892547130585,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0001819290773710236,
+      "sampling/sampling_logp_difference/max": 8.611893653869629,
+      "sampling/sampling_logp_difference/mean": 0.02072504535317421,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 5.845633268108941e-06,
+      "clip_ratio/high_mean": 1.4614083170272352e-06,
+      "clip_ratio/low_mean": 3.207486906831036e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353627721480734e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 7379.390625,
+      "completions/mean_terminated_length": 7236.4609375,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.8977236375212669,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001972826896235347,
+      "learning_rate": 1e-05,
+      "loss": 0.0228,
+      "num_tokens": 332849112.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 2.820451663865242e-05,
+      "sampling/sampling_logp_difference/max": 10.476028442382812,
+      "sampling/sampling_logp_difference/mean": 0.019411223009228706,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 4.875385002378607e-06,
+      "clip_ratio/high_mean": 1.2188462505946518e-06,
+      "clip_ratio/low_mean": 2.3530714997832547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.47495612484272e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15517.0,
+      "completions/mean_length": 6867.9609375,
+      "completions/mean_terminated_length": 6793.03125,
+      "completions/min_length": 760.0,
+      "completions/min_terminated_length": 760.0,
+      "entropy": 0.9244343340396881,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.006926023401319981,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 333746179.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1433562934398651,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.0003875594411510974,
+      "sampling/sampling_logp_difference/max": 7.8556413650512695,
+      "sampling/sampling_logp_difference/mean": 0.020311862230300903,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 1.5651628245905158e-05,
+      "clip_ratio/high_mean": 4.836261211949022e-06,
+      "clip_ratio/low_mean": 5.268017821435933e-05,
+      "clip_ratio/low_min": 3.950945028918795e-06,
+      "clip_ratio/region_mean": 5.751643902840442e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 7525.375,
+      "completions/mean_terminated_length": 6855.3955078125,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9207312315702438,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0047226278111338615,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 334731027.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3353874683380127,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999615550041199,
+      "sampling/importance_sampling_ratio/min": 0.00029753465787507594,
+      "sampling/sampling_logp_difference/max": 8.119979858398438,
+      "sampling/sampling_logp_difference/mean": 0.021496692672371864,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 3.815379886873416e-05,
+      "clip_ratio/high_mean": 9.53844971718354e-06,
+      "clip_ratio/low_mean": 4.519663821156428e-05,
+      "clip_ratio/low_min": 2.775434040813707e-06,
+      "clip_ratio/region_mean": 5.473508826980833e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16251.0,
+      "completions/mean_length": 6841.0625,
+      "completions/mean_terminated_length": 6453.13818359375,
+      "completions/min_length": 689.0,
+      "completions/min_terminated_length": 689.0,
+      "entropy": 0.8979457840323448,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004971448332071304,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 335631243.0,
+      "reward": 0.390625,
+      "reward_std": 0.2596156895160675,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999934196472168,
+      "sampling/importance_sampling_ratio/min": 9.655764188210014e-06,
+      "sampling/sampling_logp_difference/max": 11.547955513000488,
+      "sampling/sampling_logp_difference/mean": 0.020256079733371735,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 4.162365712545579e-06,
+      "clip_ratio/high_mean": 1.0405914281363948e-06,
+      "clip_ratio/low_mean": 3.1563491688757495e-05,
+      "clip_ratio/low_min": 3.1228139505401487e-06,
+      "clip_ratio/region_mean": 3.260408311689389e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15060.0,
+      "completions/mean_length": 6919.8046875,
+      "completions/mean_terminated_length": 6454.35205078125,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9241961911320686,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038604787550866604,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 336537162.0,
+      "reward": 0.375,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998080730438232,
+      "sampling/importance_sampling_ratio/min": 0.0009118975722230971,
+      "sampling/sampling_logp_difference/max": 6.999982833862305,
+      "sampling/sampling_logp_difference/mean": 0.02030865103006363,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 6.5182248363271356e-06,
+      "clip_ratio/high_mean": 1.6295562090817839e-06,
+      "clip_ratio/low_mean": 4.3847362121596234e-05,
+      "clip_ratio/low_min": 6.294533704931382e-06,
+      "clip_ratio/region_mean": 4.547691833067802e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15692.0,
+      "completions/mean_length": 7679.390625,
+      "completions/mean_terminated_length": 7099.08349609375,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "entropy": 1.0165777206420898,
+      "epoch": 0.35418583256669733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004624314606189728,
+      "learning_rate": 1e-05,
+      "loss": 0.0849,
+      "num_tokens": 337542492.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999251961708069,
+      "sampling/importance_sampling_ratio/min": 5.83546279813163e-05,
+      "sampling/sampling_logp_difference/max": 9.748971939086914,
+      "sampling/sampling_logp_difference/mean": 0.02206476218998432,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 6.00499606662197e-06,
+      "clip_ratio/high_mean": 1.5012490166554926e-06,
+      "clip_ratio/low_mean": 3.392923713363416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.543048615028965e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 5957.5859375,
+      "completions/mean_terminated_length": 5792.08740234375,
+      "completions/min_length": 1705.0,
+      "completions/min_terminated_length": 1705.0,
+      "entropy": 0.7705951780080795,
+      "epoch": 0.35510579576816925,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021966886706650257,
+      "learning_rate": 1e-05,
+      "loss": 0.0789,
+      "num_tokens": 338324279.0,
+      "reward": 0.53125,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999998927116394,
+      "sampling/importance_sampling_ratio/min": 0.0008041196851991117,
+      "sampling/sampling_logp_difference/max": 7.125762462615967,
+      "sampling/sampling_logp_difference/mean": 0.01804077997803688,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 1.5711350215497077e-05,
+      "clip_ratio/high_mean": 3.927837553874269e-06,
+      "clip_ratio/low_mean": 5.276240381135722e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.669024130838807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7269.8046875,
+      "completions/mean_terminated_length": 7198.03955078125,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 1.0025205165147781,
+      "epoch": 0.3560257589696412,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001694107661023736,
+      "learning_rate": 1e-05,
+      "loss": 0.134,
+      "num_tokens": 339274662.0,
+      "reward": 0.3359375,
+      "reward_std": 0.30487072467803955,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039769172668,
+      "sampling/importance_sampling_ratio/min": 0.0015677008777856827,
+      "sampling/sampling_logp_difference/max": 6.4581451416015625,
+      "sampling/sampling_logp_difference/mean": 0.021742526441812515,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 7.005848829066963e-06,
+      "clip_ratio/high_mean": 1.7514622072667407e-06,
+      "clip_ratio/low_mean": 5.100632029098051e-05,
+      "clip_ratio/low_min": 8.934973720897688e-06,
+      "clip_ratio/region_mean": 5.275778244140383e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7643.8359375,
+      "completions/mean_terminated_length": 7288.54443359375,
+      "completions/min_length": 1061.0,
+      "completions/min_terminated_length": 1061.0,
+      "entropy": 0.7936615869402885,
+      "epoch": 0.35694572217111314,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004587972536683083,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 340272689.0,
+      "reward": 0.5078125,
+      "reward_std": 0.35324612259864807,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999613761901855,
+      "sampling/importance_sampling_ratio/min": 0.0007390327518805861,
+      "sampling/sampling_logp_difference/max": 7.210168361663818,
+      "sampling/sampling_logp_difference/mean": 0.01862112432718277,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 1.0522736374696251e-05,
+      "clip_ratio/high_mean": 2.6306840936740628e-06,
+      "clip_ratio/low_mean": 2.139122614153166e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4021910121518886e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14401.0,
+      "completions/mean_length": 7068.734375,
+      "completions/mean_terminated_length": 6610.60595703125,
+      "completions/min_length": 775.0,
+      "completions/min_terminated_length": 775.0,
+      "entropy": 0.8858344480395317,
+      "epoch": 0.3578656853725851,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00245783943682909,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 341195599.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21594557166099548,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957263469696,
+      "sampling/importance_sampling_ratio/min": 1.526316918898374e-05,
+      "sampling/sampling_logp_difference/max": 11.090067863464355,
+      "sampling/sampling_logp_difference/mean": 0.019989900290966034,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 5.272259386401856e-06,
+      "clip_ratio/high_mean": 1.318064846600464e-06,
+      "clip_ratio/low_mean": 2.2939096254503966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4257160987417592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15788.0,
+      "completions/mean_length": 6093.296875,
+      "completions/mean_terminated_length": 5929.95263671875,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.9640207663178444,
+      "epoch": 0.35878564857405704,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0067657483741641045,
+      "learning_rate": 1e-05,
+      "loss": 0.0181,
+      "num_tokens": 341993565.0,
+      "reward": 0.4453125,
+      "reward_std": 0.12415502220392227,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998992681503296,
+      "sampling/importance_sampling_ratio/min": 0.010459281504154205,
+      "sampling/sampling_logp_difference/max": 4.56026554107666,
+      "sampling/sampling_logp_difference/mean": 0.02037961222231388,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.566248594528588e-05,
+      "clip_ratio/low_min": 4.402028480399167e-06,
+      "clip_ratio/region_mean": 4.566248594528588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16170.0,
+      "completions/max_terminated_length": 16170.0,
+      "completions/mean_length": 7620.09375,
+      "completions/mean_terminated_length": 7620.09375,
+      "completions/min_length": 1076.0,
+      "completions/min_terminated_length": 1076.0,
+      "entropy": 0.9773544892668724,
+      "epoch": 0.35970561177552896,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018817185191437602,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 342990545.0,
+      "reward": 0.3046875,
+      "reward_std": 0.18755048513412476,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0006883936002850533,
+      "sampling/sampling_logp_difference/max": 7.281149864196777,
+      "sampling/sampling_logp_difference/mean": 0.021528441458940506,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 2.6727505428425502e-05,
+      "clip_ratio/high_mean": 7.985045499481203e-06,
+      "clip_ratio/low_mean": 7.762144696243922e-05,
+      "clip_ratio/low_min": 2.4772080450929934e-05,
+      "clip_ratio/region_mean": 8.560649303035461e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15053.0,
+      "completions/mean_length": 6963.984375,
+      "completions/mean_terminated_length": 6737.904296875,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 0.9683744385838509,
+      "epoch": 0.36062557497700093,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052104732021689415,
+      "learning_rate": 1e-05,
+      "loss": 0.087,
+      "num_tokens": 343898791.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324679374695,
+      "sampling/importance_sampling_ratio/min": 0.010815954767167568,
+      "sampling/sampling_logp_difference/max": 4.526732921600342,
+      "sampling/sampling_logp_difference/mean": 0.021434593945741653,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 1.3545108686230378e-05,
+      "clip_ratio/high_mean": 4.365133804640209e-06,
+      "clip_ratio/low_mean": 2.5377692509209737e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9742826200163108e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15116.0,
+      "completions/mean_length": 6718.5078125,
+      "completions/mean_terminated_length": 6642.4013671875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9043834507465363,
+      "epoch": 0.36154553817847285,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005151392426341772,
+      "learning_rate": 1e-05,
+      "loss": 0.0085,
+      "num_tokens": 344779672.0,
+      "reward": 0.4921875,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999840497970581,
+      "sampling/importance_sampling_ratio/min": 0.0024171893019229174,
+      "sampling/sampling_logp_difference/max": 6.025149822235107,
+      "sampling/sampling_logp_difference/mean": 0.0201373603194952,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 1.2263486723895767e-05,
+      "clip_ratio/high_mean": 3.927679188109323e-06,
+      "clip_ratio/low_mean": 2.739263118201052e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132031042696326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16342.0,
+      "completions/mean_length": 7044.640625,
+      "completions/mean_terminated_length": 6820.49609375,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "entropy": 0.9017335474491119,
+      "epoch": 0.3624655013799448,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026606651954352856,
+      "learning_rate": 1e-05,
+      "loss": 0.0554,
+      "num_tokens": 345701722.0,
+      "reward": 0.3125,
+      "reward_std": 0.24146249890327454,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05,
+      "sampling/sampling_logp_difference/max": 10.157968521118164,
+      "sampling/sampling_logp_difference/mean": 0.01981864869594574,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 1.026556356009678e-05,
+      "clip_ratio/high_mean": 2.566390890024195e-06,
+      "clip_ratio/low_mean": 4.819571529424138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0762106297952414e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15476.0,
+      "completions/mean_length": 6031.875,
+      "completions/mean_terminated_length": 5950.3623046875,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.8537683561444283,
+      "epoch": 0.36338546458141674,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003957017324864864,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 346492810.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999707341194153,
+      "sampling/importance_sampling_ratio/min": 0.0015133036067709327,
+      "sampling/sampling_logp_difference/max": 6.493460178375244,
+      "sampling/sampling_logp_difference/mean": 0.018711457028985023,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 5.870488848813693e-06,
+      "clip_ratio/high_mean": 1.4676222122034233e-06,
+      "clip_ratio/low_mean": 3.637038832948747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.783801014378696e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15942.0,
+      "completions/mean_length": 7429.3515625,
+      "completions/mean_terminated_length": 6911.31396484375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.8821266070008278,
+      "epoch": 0.36430542778288866,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002122648525983095,
+      "learning_rate": 1e-05,
+      "loss": 0.1257,
+      "num_tokens": 347462871.0,
+      "reward": 0.453125,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000076293945312,
+      "sampling/importance_sampling_ratio/min": 0.00014005196862854064,
+      "sampling/sampling_logp_difference/max": 8.873497009277344,
+      "sampling/sampling_logp_difference/mean": 0.01998838409781456,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 1.0663932243915042e-05,
+      "clip_ratio/high_mean": 2.6659830609787605e-06,
+      "clip_ratio/low_mean": 6.443337406381033e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.709935701110226e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15761.0,
+      "completions/mean_length": 7131.7109375,
+      "completions/mean_terminated_length": 6833.25,
+      "completions/min_length": 821.0,
+      "completions/min_terminated_length": 821.0,
+      "entropy": 0.8575824722647667,
+      "epoch": 0.36522539098436063,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002546454081311822,
+      "learning_rate": 1e-05,
+      "loss": 0.0676,
+      "num_tokens": 348395842.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999964714050293,
+      "sampling/importance_sampling_ratio/min": 0.0002167800412280485,
+      "sampling/sampling_logp_difference/max": 8.436627388000488,
+      "sampling/sampling_logp_difference/mean": 0.0193922221660614,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 3.847337666229578e-06,
+      "clip_ratio/high_mean": 9.618344165573944e-07,
+      "clip_ratio/low_mean": 3.932982110654848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.029165563679271e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16200.0,
+      "completions/mean_length": 6858.34375,
+      "completions/mean_terminated_length": 6707.14306640625,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.9539813920855522,
+      "epoch": 0.36614535418583255,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00492837093770504,
+      "learning_rate": 1e-05,
+      "loss": 0.0818,
+      "num_tokens": 349292790.0,
+      "reward": 0.390625,
+      "reward_std": 0.1949220597743988,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998850226402283,
+      "sampling/importance_sampling_ratio/min": 0.0011153683299198747,
+      "sampling/sampling_logp_difference/max": 6.79857063293457,
+      "sampling/sampling_logp_difference/mean": 0.020318543538451195,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 1.291372609557584e-05,
+      "clip_ratio/high_mean": 3.22843152389396e-06,
+      "clip_ratio/low_mean": 3.8245348378040944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1473780811429606e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15261.0,
+      "completions/mean_length": 7809.984375,
+      "completions/mean_terminated_length": 7533.40283203125,
+      "completions/min_length": 1002.0,
+      "completions/min_terminated_length": 1002.0,
+      "entropy": 0.8353303670883179,
+      "epoch": 0.3670653173873045,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004895905964076519,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 350312556.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22567616403102875,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999260306358337,
+      "sampling/importance_sampling_ratio/min": 0.0008417933131568134,
+      "sampling/sampling_logp_difference/max": 7.0799760818481445,
+      "sampling/sampling_logp_difference/mean": 0.018754083663225174,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 1.1250081115576904e-05,
+      "clip_ratio/high_mean": 3.5690324011738994e-06,
+      "clip_ratio/low_mean": 3.196108968950284e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.553012152224255e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15057.0,
+      "completions/mean_length": 7194.9296875,
+      "completions/mean_terminated_length": 6821.39013671875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9744522422552109,
+      "epoch": 0.36798528058877644,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0032397822942584753,
+      "learning_rate": 1e-05,
+      "loss": 0.0402,
+      "num_tokens": 351252755.0,
+      "reward": 0.421875,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998766183853149,
+      "sampling/importance_sampling_ratio/min": 0.00023159870761446655,
+      "sampling/sampling_logp_difference/max": 8.370504379272461,
+      "sampling/sampling_logp_difference/mean": 0.02105094864964485,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 6.980455509619787e-06,
+      "clip_ratio/high_mean": 1.7451138774049468e-06,
+      "clip_ratio/low_mean": 2.2670621888210007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.441573599298863e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 6836.234375,
+      "completions/mean_terminated_length": 6607.08837890625,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.9149863049387932,
+      "epoch": 0.3689052437902484,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031576494220644236,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 352145873.0,
+      "reward": 0.3671875,
+      "reward_std": 0.22225630283355713,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999266862869263,
+      "sampling/importance_sampling_ratio/min": 0.0011975533561781049,
+      "sampling/sampling_logp_difference/max": 6.727474689483643,
+      "sampling/sampling_logp_difference/mean": 0.020445333793759346,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 2.3557336589874467e-05,
+      "clip_ratio/high_mean": 5.889334147468617e-06,
+      "clip_ratio/low_mean": 5.359988131203863e-05,
+      "clip_ratio/low_min": 1.3856095392839052e-05,
+      "clip_ratio/region_mean": 5.9489215118446737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 6942.65625,
+      "completions/mean_terminated_length": 6638.0966796875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "entropy": 0.7541583999991417,
+      "epoch": 0.36982520699172033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003970830701291561,
+      "learning_rate": 1e-05,
+      "loss": 0.051,
+      "num_tokens": 353056405.0,
+      "reward": 0.453125,
+      "reward_std": 0.3282659649848938,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 8.399576472584158e-06,
+      "sampling/sampling_logp_difference/max": 11.687329292297363,
+      "sampling/sampling_logp_difference/mean": 0.018101349472999573,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 2.6139805413549766e-05,
+      "clip_ratio/high_mean": 7.517377525800839e-06,
+      "clip_ratio/low_mean": 1.968103515537223e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7198412681173068e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14786.0,
+      "completions/max_terminated_length": 14786.0,
+      "completions/mean_length": 6022.1875,
+      "completions/mean_terminated_length": 6022.1875,
+      "completions/min_length": 1285.0,
+      "completions/min_terminated_length": 1285.0,
+      "entropy": 0.9535745903849602,
+      "epoch": 0.37074517019319225,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0043656788766384125,
+      "learning_rate": 1e-05,
+      "loss": 0.029,
+      "num_tokens": 353844661.0,
+      "reward": 0.4140625,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.04981832951307297,
+      "sampling/sampling_logp_difference/max": 2.9993722438812256,
+      "sampling/sampling_logp_difference/mean": 0.020655371248722076,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 9.152076700047473e-06,
+      "clip_ratio/high_mean": 2.9508817647183605e-06,
+      "clip_ratio/low_mean": 5.21388310517068e-05,
+      "clip_ratio/low_min": 2.633131089169183e-06,
+      "clip_ratio/region_mean": 5.508971298695542e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15906.0,
+      "completions/mean_length": 8068.96875,
+      "completions/mean_terminated_length": 7869.408203125,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.9473539590835571,
+      "epoch": 0.3716651333946642,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006543307099491358,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 354894689.0,
+      "reward": 0.2578125,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999514818191528,
+      "sampling/importance_sampling_ratio/min": 6.672408926533535e-05,
+      "sampling/sampling_logp_difference/max": 9.614944458007812,
+      "sampling/sampling_logp_difference/mean": 0.021852033212780952,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 2.9619268843816826e-05,
+      "clip_ratio/high_mean": 7.4048172109542065e-06,
+      "clip_ratio/low_mean": 5.5152235972855124e-05,
+      "clip_ratio/low_min": 1.0455875781190116e-05,
+      "clip_ratio/region_mean": 6.255705375224352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15748.0,
+      "completions/mean_length": 5960.1875,
+      "completions/mean_terminated_length": 5878.1103515625,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "entropy": 0.9564141109585762,
+      "epoch": 0.37258509659613614,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003351036459207535,
+      "learning_rate": 1e-05,
+      "loss": 0.0293,
+      "num_tokens": 355677273.0,
+      "reward": 0.46875,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999220371246338,
+      "sampling/importance_sampling_ratio/min": 0.0012859756825491786,
+      "sampling/sampling_logp_difference/max": 6.656237602233887,
+      "sampling/sampling_logp_difference/mean": 0.021779976785182953,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 7.957685966175632e-06,
+      "clip_ratio/high_mean": 1.989421491543908e-06,
+      "clip_ratio/low_mean": 3.758041248147492e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.956983414354909e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15669.0,
+      "completions/mean_length": 7620.21875,
+      "completions/mean_terminated_length": 7189.212890625,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 1.035948596894741,
+      "epoch": 0.3735050597976081,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031219006050378084,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 356675829.0,
+      "reward": 0.296875,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001060962677002,
+      "sampling/importance_sampling_ratio/min": 0.010141897015273571,
+      "sampling/sampling_logp_difference/max": 4.591080188751221,
+      "sampling/sampling_logp_difference/mean": 0.021951109170913696,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 2.286768199155631e-05,
+      "clip_ratio/high_mean": 5.7169204978890775e-06,
+      "clip_ratio/low_mean": 3.914574369900947e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.486266482217616e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14038.0,
+      "completions/mean_length": 5806.0234375,
+      "completions/mean_terminated_length": 5638.119140625,
+      "completions/min_length": 1319.0,
+      "completions/min_terminated_length": 1319.0,
+      "entropy": 0.8977029845118523,
+      "epoch": 0.37442502299908004,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002810312667861581,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 357438712.0,
+      "reward": 0.546875,
+      "reward_std": 0.22832970321178436,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999280571937561,
+      "sampling/importance_sampling_ratio/min": 0.0011738575994968414,
+      "sampling/sampling_logp_difference/max": 6.747459888458252,
+      "sampling/sampling_logp_difference/mean": 0.01965375244617462,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 1.2219379641464911e-05,
+      "clip_ratio/high_mean": 3.054844910366228e-06,
+      "clip_ratio/low_mean": 3.186109779562685e-05,
+      "clip_ratio/low_min": 4.3511558942554984e-06,
+      "clip_ratio/region_mean": 3.4915943160740426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15705.0,
+      "completions/max_terminated_length": 15705.0,
+      "completions/mean_length": 6537.4609375,
+      "completions/mean_terminated_length": 6537.4609375,
+      "completions/min_length": 842.0,
+      "completions/min_terminated_length": 842.0,
+      "entropy": 0.9577726796269417,
+      "epoch": 0.37534498620055196,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004516562446951866,
+      "learning_rate": 1e-05,
+      "loss": 0.0517,
+      "num_tokens": 358296731.0,
+      "reward": 0.3828125,
+      "reward_std": 0.1830746978521347,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999170303344727,
+      "sampling/importance_sampling_ratio/min": 2.384942035860149e-06,
+      "sampling/sampling_logp_difference/max": 12.946335792541504,
+      "sampling/sampling_logp_difference/mean": 0.021242395043373108,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 1.4422689218918094e-05,
+      "clip_ratio/high_mean": 3.6056723047295236e-06,
+      "clip_ratio/low_mean": 3.026239573955536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3868068385345396e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16360.0,
+      "completions/mean_length": 7896.671875,
+      "completions/mean_terminated_length": 7622.88671875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.9163230583071709,
+      "epoch": 0.37626494940202393,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003542230697348714,
+      "learning_rate": 1e-05,
+      "loss": 0.05,
+      "num_tokens": 359327001.0,
+      "reward": 0.375,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998560547828674,
+      "sampling/importance_sampling_ratio/min": 0.00010891625424847007,
+      "sampling/sampling_logp_difference/max": 9.124931335449219,
+      "sampling/sampling_logp_difference/mean": 0.020085681229829788,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 1.7827243254942005e-05,
+      "clip_ratio/high_mean": 5.474494003010477e-06,
+      "clip_ratio/low_mean": 4.2465159026505717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.793965263161226e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15297.0,
+      "completions/mean_length": 6728.7109375,
+      "completions/mean_terminated_length": 6652.68505859375,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9010183215141296,
+      "epoch": 0.37718491260349585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0035069347359240055,
+      "learning_rate": 1e-05,
+      "loss": 0.0518,
+      "num_tokens": 360208780.0,
+      "reward": 0.5390625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999571442604065,
+      "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05,
+      "sampling/sampling_logp_difference/max": 11.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.021022530272603035,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 1.0376989393989788e-05,
+      "clip_ratio/high_mean": 2.594247348497447e-06,
+      "clip_ratio/low_mean": 2.8587513156708155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1181759936771414e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6800.3984375,
+      "completions/mean_terminated_length": 6491.25,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.8654960840940475,
+      "epoch": 0.3781048758049678,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033910400234162807,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 361098567.0,
+      "reward": 0.5625,
+      "reward_std": 0.2306838035583496,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998576641082764,
+      "sampling/importance_sampling_ratio/min": 0.001449413481168449,
+      "sampling/sampling_logp_difference/max": 6.536596298217773,
+      "sampling/sampling_logp_difference/mean": 0.019660964608192444,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 2.3068858354236e-05,
+      "clip_ratio/high_mean": 7.792090059410839e-06,
+      "clip_ratio/low_mean": 5.8515578757578623e-05,
+      "clip_ratio/low_min": 1.0348648629587842e-05,
+      "clip_ratio/region_mean": 6.630766870330262e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7103.4453125,
+      "completions/mean_terminated_length": 6956.13525390625,
+      "completions/min_length": 1711.0,
+      "completions/min_terminated_length": 1711.0,
+      "entropy": 0.8317076042294502,
+      "epoch": 0.37902483900643974,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036110079381614923,
+      "learning_rate": 1e-05,
+      "loss": 0.0834,
+      "num_tokens": 362027520.0,
+      "reward": 0.546875,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999338984489441,
+      "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05,
+      "sampling/sampling_logp_difference/max": 11.458046913146973,
+      "sampling/sampling_logp_difference/mean": 0.01939362846314907,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 3.112394779236638e-06,
+      "clip_ratio/high_mean": 7.780986948091595e-07,
+      "clip_ratio/low_mean": 5.127149995587388e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.204959859383962e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15830.0,
+      "completions/mean_length": 7344.9296875,
+      "completions/mean_terminated_length": 6900.384765625,
+      "completions/min_length": 1368.0,
+      "completions/min_terminated_length": 1368.0,
+      "entropy": 0.8387318029999733,
+      "epoch": 0.37994480220791166,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002141098491847515,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 362985207.0,
+      "reward": 0.34375,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999322891235352,
+      "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05,
+      "sampling/sampling_logp_difference/max": 10.874617576599121,
+      "sampling/sampling_logp_difference/mean": 0.01929464004933834,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 5.2602786126954015e-06,
+      "clip_ratio/high_mean": 1.3150696531738504e-06,
+      "clip_ratio/low_mean": 1.7854434247510653e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9169503786997666e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16137.0,
+      "completions/mean_length": 6377.7734375,
+      "completions/mean_terminated_length": 6218.94482421875,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9732858911156654,
+      "epoch": 0.38086476540938363,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015244127716869116,
+      "learning_rate": 1e-05,
+      "loss": 0.0608,
+      "num_tokens": 363823914.0,
+      "reward": 0.4375,
+      "reward_std": 0.1988610327243805,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 0.006335465237498283,
+      "sampling/sampling_logp_difference/max": 5.061592102050781,
+      "sampling/sampling_logp_difference/mean": 0.020688029006123543,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 2.6195500595349586e-05,
+      "clip_ratio/high_mean": 6.548875148837396e-06,
+      "clip_ratio/low_mean": 3.3802934012783226e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035180882056011e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14456.0,
+      "completions/mean_length": 5599.7890625,
+      "completions/mean_terminated_length": 5340.96826171875,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8872368410229683,
+      "epoch": 0.38178472861085555,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002647512126713991,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 364561127.0,
+      "reward": 0.453125,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999077916145325,
+      "sampling/importance_sampling_ratio/min": 2.370526999584399e-06,
+      "sampling/sampling_logp_difference/max": 12.952398300170898,
+      "sampling/sampling_logp_difference/mean": 0.01878243312239647,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 2.157278959202813e-05,
+      "clip_ratio/high_mean": 5.3931973980070325e-06,
+      "clip_ratio/low_mean": 7.215861739950924e-05,
+      "clip_ratio/low_min": 1.4898997051204788e-05,
+      "clip_ratio/region_mean": 7.755181559332414e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15905.0,
+      "completions/mean_length": 7877.2890625,
+      "completions/mean_terminated_length": 7385.1650390625,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.8416353687644005,
+      "epoch": 0.3827046918123275,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018051012884825468,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 365590124.0,
+      "reward": 0.3125,
+      "reward_std": 0.28407180309295654,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.0004095165350008756,
+      "sampling/sampling_logp_difference/max": 7.800533294677734,
+      "sampling/sampling_logp_difference/mean": 0.019809434190392494,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 2.540994637456606e-05,
+      "clip_ratio/high_mean": 6.352486593641515e-06,
+      "clip_ratio/low_mean": 4.230594890941575e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8658435844117776e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16083.0,
+      "completions/mean_length": 6836.7890625,
+      "completions/mean_terminated_length": 6200.30859375,
+      "completions/min_length": 909.0,
+      "completions/min_terminated_length": 909.0,
+      "entropy": 0.8647575601935387,
+      "epoch": 0.38362465501379944,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004550795070827007,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 366486337.0,
+      "reward": 0.40625,
+      "reward_std": 0.22620806097984314,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873638153076,
+      "sampling/importance_sampling_ratio/min": 0.0001089095021598041,
+      "sampling/sampling_logp_difference/max": 9.124993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01992485672235489,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 1.1592664577619871e-05,
+      "clip_ratio/high_mean": 2.8981661444049678e-06,
+      "clip_ratio/low_mean": 3.5717548257707676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.861571451579948e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16286.0,
+      "completions/mean_length": 6884.953125,
+      "completions/mean_terminated_length": 6417.78662109375,
+      "completions/min_length": 1289.0,
+      "completions/min_terminated_length": 1289.0,
+      "entropy": 0.8691708743572235,
+      "epoch": 0.3845446182152714,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005958946421742439,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 367386163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000011920928955,
+      "sampling/importance_sampling_ratio/min": 9.519772902422119e-06,
+      "sampling/sampling_logp_difference/max": 11.562139511108398,
+      "sampling/sampling_logp_difference/mean": 0.019436441361904144,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 2.7658640192385064e-05,
+      "clip_ratio/high_mean": 8.455849524580117e-06,
+      "clip_ratio/low_mean": 3.938097847822064e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7836828116487595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15574.0,
+      "completions/mean_length": 7439.1328125,
+      "completions/mean_terminated_length": 7150.58837890625,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 0.795464999973774,
+      "epoch": 0.38546458141674333,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00558120384812355,
+      "learning_rate": 1e-05,
+      "loss": 0.1918,
+      "num_tokens": 368357500.0,
+      "reward": 0.609375,
+      "reward_std": 0.3795146346092224,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.0001159337698481977,
+      "sampling/sampling_logp_difference/max": 9.062491416931152,
+      "sampling/sampling_logp_difference/mean": 0.018824251368641853,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 8.509555527780321e-06,
+      "clip_ratio/high_mean": 2.1273888819450804e-06,
+      "clip_ratio/low_mean": 3.0958593640662e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.308598269313734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16236.0,
+      "completions/mean_length": 6751.53125,
+      "completions/mean_terminated_length": 6520.3525390625,
+      "completions/min_length": 715.0,
+      "completions/min_terminated_length": 715.0,
+      "entropy": 0.9450879693031311,
+      "epoch": 0.38638454461821525,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004628168884664774,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 369242920.0,
+      "reward": 0.359375,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999655485153198,
+      "sampling/importance_sampling_ratio/min": 0.0006074689445085824,
+      "sampling/sampling_logp_difference/max": 7.406209468841553,
+      "sampling/sampling_logp_difference/mean": 0.019376013427972794,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 1.8288420505996328e-05,
+      "clip_ratio/high_mean": 4.572105126499082e-06,
+      "clip_ratio/low_mean": 4.86290555272717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320115997164976e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16164.0,
+      "completions/mean_length": 7023.296875,
+      "completions/mean_terminated_length": 6315.3447265625,
+      "completions/min_length": 1628.0,
+      "completions/min_terminated_length": 1628.0,
+      "entropy": 0.7378111630678177,
+      "epoch": 0.3873045078196872,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00389425759203732,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 370159510.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999127388000488,
+      "sampling/importance_sampling_ratio/min": 0.00014012664905749261,
+      "sampling/sampling_logp_difference/max": 8.872963905334473,
+      "sampling/sampling_logp_difference/mean": 0.016914553940296173,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 2.1269573153404053e-05,
+      "clip_ratio/high_mean": 5.948400371380558e-06,
+      "clip_ratio/low_mean": 2.3538930747690756e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9487331687505502e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16018.0,
+      "completions/max_terminated_length": 16018.0,
+      "completions/mean_length": 7702.3046875,
+      "completions/mean_terminated_length": 7702.3046875,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "entropy": 0.9053447172045708,
+      "epoch": 0.38822447102115915,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004324545152485371,
+      "learning_rate": 1e-05,
+      "loss": 0.0149,
+      "num_tokens": 371162773.0,
+      "reward": 0.2421875,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00001060962677,
+      "sampling/importance_sampling_ratio/min": 2.283278627146501e-05,
+      "sampling/sampling_logp_difference/max": 10.687313079833984,
+      "sampling/sampling_logp_difference/mean": 0.020495830103754997,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 1.0294916819475475e-05,
+      "clip_ratio/high_mean": 2.5737292048688687e-06,
+      "clip_ratio/low_mean": 5.831611520079605e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.088984559937671e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 6904.78125,
+      "completions/mean_terminated_length": 6754.31787109375,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.7991176024079323,
+      "epoch": 0.3891444342226311,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003239463549107313,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "num_tokens": 372067241.0,
+      "reward": 0.328125,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00012340991816017777,
+      "sampling/sampling_logp_difference/max": 8.999999046325684,
+      "sampling/sampling_logp_difference/mean": 0.019042208790779114,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 2.7261318791715894e-05,
+      "clip_ratio/high_mean": 7.926559305815317e-06,
+      "clip_ratio/low_mean": 1.552133551285806e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3447895273420727e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15399.0,
+      "completions/mean_length": 6107.7421875,
+      "completions/mean_terminated_length": 5602.35205078125,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "entropy": 0.9495253190398216,
+      "epoch": 0.39006439742410304,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015464330790564418,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 372866072.0,
+      "reward": 0.421875,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971330165863,
+      "sampling/importance_sampling_ratio/min": 0.00024684349773451686,
+      "sampling/sampling_logp_difference/max": 8.306756019592285,
+      "sampling/sampling_logp_difference/mean": 0.019793221727013588,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 2.457227401464479e-05,
+      "clip_ratio/high_mean": 8.533324717063806e-06,
+      "clip_ratio/low_mean": 3.261690835643094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.115023284612107e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15939.0,
+      "completions/mean_length": 6079.8046875,
+      "completions/mean_terminated_length": 5747.4111328125,
+      "completions/min_length": 1082.0,
+      "completions/min_terminated_length": 1082.0,
+      "entropy": 0.8005363270640373,
+      "epoch": 0.39098436062557496,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024811832699924707,
+      "learning_rate": 1e-05,
+      "loss": 0.1124,
+      "num_tokens": 373663463.0,
+      "reward": 0.625,
+      "reward_std": 0.2630355656147003,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743103981018,
+      "sampling/importance_sampling_ratio/min": 0.00019348970090504736,
+      "sampling/sampling_logp_difference/max": 8.550286293029785,
+      "sampling/sampling_logp_difference/mean": 0.017151469364762306,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 3.3719989005476236e-06,
+      "clip_ratio/high_mean": 8.429997251369059e-07,
+      "clip_ratio/low_mean": 2.132218082806503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2165180553201935e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14925.0,
+      "completions/mean_length": 6453.7890625,
+      "completions/mean_terminated_length": 6375.5986328125,
+      "completions/min_length": 347.0,
+      "completions/min_terminated_length": 347.0,
+      "entropy": 0.9212624430656433,
+      "epoch": 0.39190432382704693,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031475063879042864,
+      "learning_rate": 1e-05,
+      "loss": 0.0959,
+      "num_tokens": 374517492.0,
+      "reward": 0.34375,
+      "reward_std": 0.19910329580307007,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999594688415527,
+      "sampling/importance_sampling_ratio/min": 0.015664709731936455,
+      "sampling/sampling_logp_difference/max": 4.156344890594482,
+      "sampling/sampling_logp_difference/mean": 0.019899867475032806,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 1.907509408738406e-05,
+      "clip_ratio/high_mean": 5.984868664654641e-06,
+      "clip_ratio/low_mean": 3.784128080042137e-05,
+      "clip_ratio/low_min": 3.7751804029539926e-06,
+      "clip_ratio/region_mean": 4.382614952191943e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16159.0,
+      "completions/max_terminated_length": 16159.0,
+      "completions/mean_length": 6126.9921875,
+      "completions/mean_terminated_length": 6126.9921875,
+      "completions/min_length": 1106.0,
+      "completions/min_terminated_length": 1106.0,
+      "entropy": 0.8252849578857422,
+      "epoch": 0.39282428702851885,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004200868774205446,
+      "learning_rate": 1e-05,
+      "loss": 0.0276,
+      "num_tokens": 375320339.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999815225601196,
+      "sampling/importance_sampling_ratio/min": 0.005763276945799589,
+      "sampling/sampling_logp_difference/max": 5.156249046325684,
+      "sampling/sampling_logp_difference/mean": 0.01833093911409378,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 1.8918785372079583e-05,
+      "clip_ratio/high_mean": 5.476571459439583e-06,
+      "clip_ratio/low_mean": 6.169724406390742e-05,
+      "clip_ratio/low_min": 7.494657666029525e-06,
+      "clip_ratio/region_mean": 6.717381506859965e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15411.0,
+      "completions/mean_length": 6739.09375,
+      "completions/mean_terminated_length": 6427.9677734375,
+      "completions/min_length": 1228.0,
+      "completions/min_terminated_length": 1228.0,
+      "entropy": 0.8008574098348618,
+      "epoch": 0.3937442502299908,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003204014617949724,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 376201015.0,
+      "reward": 0.5390625,
+      "reward_std": 0.37086254358291626,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998303651809692,
+      "sampling/importance_sampling_ratio/min": 0.00010144581028725952,
+      "sampling/sampling_logp_difference/max": 9.195985794067383,
+      "sampling/sampling_logp_difference/mean": 0.018961725756525993,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 1.3558789078160771e-05,
+      "clip_ratio/high_mean": 3.389697269540193e-06,
+      "clip_ratio/low_mean": 5.3925050679026754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.731474743697618e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15634.0,
+      "completions/mean_length": 7245.8984375,
+      "completions/mean_terminated_length": 6951.12060546875,
+      "completions/min_length": 1306.0,
+      "completions/min_terminated_length": 1306.0,
+      "entropy": 1.0351596996188164,
+      "epoch": 0.39466421343146274,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0039763906970620155,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 377149650.0,
+      "reward": 0.375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000600814819336,
+      "sampling/importance_sampling_ratio/min": 8.106228051474318e-05,
+      "sampling/sampling_logp_difference/max": 9.420292854309082,
+      "sampling/sampling_logp_difference/mean": 0.020948028191924095,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 1.4580486549675697e-05,
+      "clip_ratio/high_mean": 4.259903903403028e-06,
+      "clip_ratio/low_mean": 4.6149686397711775e-05,
+      "clip_ratio/low_min": 3.006686938533676e-06,
+      "clip_ratio/region_mean": 5.04095905853319e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 6958.625,
+      "completions/mean_terminated_length": 6495.08154296875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.8360240310430527,
+      "epoch": 0.39558417663293466,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0031417158897966146,
+      "learning_rate": 1e-05,
+      "loss": 0.0195,
+      "num_tokens": 378057802.0,
+      "reward": 0.515625,
+      "reward_std": 0.35771697759628296,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999384880065918,
+      "sampling/importance_sampling_ratio/min": 0.00010235882655251771,
+      "sampling/sampling_logp_difference/max": 9.187026023864746,
+      "sampling/sampling_logp_difference/mean": 0.019185224547982216,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 6.681633749394678e-06,
+      "clip_ratio/high_mean": 1.6704084373486694e-06,
+      "clip_ratio/low_mean": 5.096616632727091e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.263657521936693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15410.0,
+      "completions/max_terminated_length": 15410.0,
+      "completions/mean_length": 5696.3984375,
+      "completions/mean_terminated_length": 5696.3984375,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.7887749597430229,
+      "epoch": 0.39650413983440663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004943124484270811,
+      "learning_rate": 1e-05,
+      "loss": 0.096,
+      "num_tokens": 378808021.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999057054519653,
+      "sampling/importance_sampling_ratio/min": 0.0015042300801724195,
+      "sampling/sampling_logp_difference/max": 6.499474048614502,
+      "sampling/sampling_logp_difference/mean": 0.018845941871404648,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 1.7526824194646906e-05,
+      "clip_ratio/high_mean": 5.417880970526312e-06,
+      "clip_ratio/low_mean": 3.513921649300755e-05,
+      "clip_ratio/low_min": 6.075038982089609e-06,
+      "clip_ratio/region_mean": 4.0557096895099676e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14233.0,
+      "completions/mean_length": 6480.8828125,
+      "completions/mean_terminated_length": 6323.69091796875,
+      "completions/min_length": 1013.0,
+      "completions/min_terminated_length": 1013.0,
+      "entropy": 0.8796411231160164,
+      "epoch": 0.39742410303587855,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00595651101320982,
+      "learning_rate": 1e-05,
+      "loss": 0.0546,
+      "num_tokens": 379659710.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 0.0017907419241964817,
+      "sampling/sampling_logp_difference/max": 6.325125217437744,
+      "sampling/sampling_logp_difference/mean": 0.01906527951359749,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4512424602107785e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4512424602107785e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7501.703125,
+      "completions/mean_terminated_length": 6829.93310546875,
+      "completions/min_length": 680.0,
+      "completions/min_terminated_length": 680.0,
+      "entropy": 0.786028303205967,
+      "epoch": 0.3983440662373505,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0024527597706764936,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 380640720.0,
+      "reward": 0.5234375,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999595880508423,
+      "sampling/importance_sampling_ratio/min": 8.851602615322918e-07,
+      "sampling/sampling_logp_difference/max": 13.93749713897705,
+      "sampling/sampling_logp_difference/mean": 0.01873261108994484,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 1.4606259583160863e-05,
+      "clip_ratio/high_mean": 5.505394312876888e-06,
+      "clip_ratio/low_mean": 3.1679782978244475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7185177234277944e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15185.0,
+      "completions/mean_length": 5619.2890625,
+      "completions/mean_terminated_length": 5448.4208984375,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.8098893761634827,
+      "epoch": 0.39926402943882244,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004280989523977041,
+      "learning_rate": 1e-05,
+      "loss": 0.0514,
+      "num_tokens": 381377981.0,
+      "reward": 0.609375,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999443292617798,
+      "sampling/importance_sampling_ratio/min": 0.0010248658945783973,
+      "sampling/sampling_logp_difference/max": 6.883193492889404,
+      "sampling/sampling_logp_difference/mean": 0.017923470586538315,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 1.4808703554081148e-05,
+      "clip_ratio/high_mean": 3.702175888520287e-06,
+      "clip_ratio/low_mean": 2.3637440563106793e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7339616224253405e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5243.8203125,
+      "completions/mean_terminated_length": 5156.1025390625,
+      "completions/min_length": 576.0,
+      "completions/min_terminated_length": 576.0,
+      "entropy": 0.7485036551952362,
+      "epoch": 0.40018399264029436,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004721642471849918,
+      "learning_rate": 1e-05,
+      "loss": 0.0877,
+      "num_tokens": 382070478.0,
+      "reward": 0.6875,
+      "reward_std": 0.26538965106010437,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999414086341858,
+      "sampling/importance_sampling_ratio/min": 0.0011518355458974838,
+      "sampling/sampling_logp_difference/max": 6.7663984298706055,
+      "sampling/sampling_logp_difference/mean": 0.016579966992139816,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 3.1177480195765384e-05,
+      "clip_ratio/high_mean": 1.1174359769938746e-05,
+      "clip_ratio/low_mean": 3.602651599976525e-05,
+      "clip_ratio/low_min": 4.348733455117326e-06,
+      "clip_ratio/region_mean": 4.720087713394605e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15978.0,
+      "completions/mean_length": 7021.1796875,
+      "completions/mean_terminated_length": 6872.56396484375,
+      "completions/min_length": 1371.0,
+      "completions/min_terminated_length": 1371.0,
+      "entropy": 0.8693460151553154,
+      "epoch": 0.40110395584176634,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00329192029312253,
+      "learning_rate": 1e-05,
+      "loss": 0.0342,
+      "num_tokens": 382990245.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999822378158569,
+      "sampling/importance_sampling_ratio/min": 0.0023386883549392223,
+      "sampling/sampling_logp_difference/max": 6.058165073394775,
+      "sampling/sampling_logp_difference/mean": 0.019863136112689972,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 1.1192694955752813e-05,
+      "clip_ratio/high_mean": 2.7981737389382033e-06,
+      "clip_ratio/low_mean": 4.9078003257818636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.1876177280973934e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15344.0,
+      "completions/mean_length": 6917.625,
+      "completions/mean_terminated_length": 6452.0654296875,
+      "completions/min_length": 945.0,
+      "completions/min_terminated_length": 945.0,
+      "entropy": 0.8466897681355476,
+      "epoch": 0.40202391904323825,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0051889242604374886,
+      "learning_rate": 1e-05,
+      "loss": 0.1009,
+      "num_tokens": 383896717.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999983310699463,
+      "sampling/importance_sampling_ratio/min": 0.00015846389578655362,
+      "sampling/sampling_logp_difference/max": 8.749983787536621,
+      "sampling/sampling_logp_difference/mean": 0.019528398290276527,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 2.3224948108691024e-05,
+      "clip_ratio/high_mean": 8.263948757303297e-06,
+      "clip_ratio/low_mean": 3.8556312347282073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.682026019509067e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16175.0,
+      "completions/mean_length": 7487.5078125,
+      "completions/mean_terminated_length": 7346.2939453125,
+      "completions/min_length": 877.0,
+      "completions/min_terminated_length": 877.0,
+      "entropy": 0.9584660083055496,
+      "epoch": 0.4029438822447102,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002855573548004031,
+      "learning_rate": 1e-05,
+      "loss": 0.0087,
+      "num_tokens": 384872622.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2477683424949646,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999386668205261,
+      "sampling/importance_sampling_ratio/min": 0.0038593418430536985,
+      "sampling/sampling_logp_difference/max": 5.557258605957031,
+      "sampling/sampling_logp_difference/mean": 0.0209865253418684,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 6.171620498207631e-06,
+      "clip_ratio/high_mean": 1.5429051245519076e-06,
+      "clip_ratio/low_mean": 2.98128834401723e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.135578845103737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16092.0,
+      "completions/mean_length": 6637.5078125,
+      "completions/mean_terminated_length": 6323.1044921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 0.8841215297579765,
+      "epoch": 0.40386384544618215,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004437311552464962,
+      "learning_rate": 1e-05,
+      "loss": 0.0523,
+      "num_tokens": 385744023.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999136924743652,
+      "sampling/importance_sampling_ratio/min": 0.002925124252215028,
+      "sampling/sampling_logp_difference/max": 5.834418296813965,
+      "sampling/sampling_logp_difference/mean": 0.019490888342261314,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 1.3304874300956726e-05,
+      "clip_ratio/high_mean": 3.3262185752391815e-06,
+      "clip_ratio/low_mean": 5.443932013804442e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.776553894065728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15143.0,
+      "completions/mean_length": 5965.9765625,
+      "completions/mean_terminated_length": 5800.611328125,
+      "completions/min_length": 621.0,
+      "completions/min_terminated_length": 621.0,
+      "entropy": 0.8726934269070625,
+      "epoch": 0.4047838086476541,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002463799435645342,
+      "learning_rate": 1e-05,
+      "loss": -0.0075,
+      "num_tokens": 386525492.0,
+      "reward": 0.3984375,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.00020367901015561074,
+      "sampling/sampling_logp_difference/max": 8.4989652633667,
+      "sampling/sampling_logp_difference/mean": 0.01946769654750824,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 1.0084711902891286e-05,
+      "clip_ratio/high_mean": 3.6154040117253317e-06,
+      "clip_ratio/low_mean": 3.598771945689805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9603123695997056e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6693.109375,
+      "completions/mean_terminated_length": 6616.80322265625,
+      "completions/min_length": 1704.0,
+      "completions/min_terminated_length": 1704.0,
+      "entropy": 0.9430640190839767,
+      "epoch": 0.40570377184912604,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038990566972643137,
+      "learning_rate": 1e-05,
+      "loss": 0.0415,
+      "num_tokens": 387404842.0,
+      "reward": 0.421875,
+      "reward_std": 0.31587693095207214,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999700784683228,
+      "sampling/importance_sampling_ratio/min": 0.0011708902893587947,
+      "sampling/sampling_logp_difference/max": 6.749990940093994,
+      "sampling/sampling_logp_difference/mean": 0.020848294720053673,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 7.462686426151777e-06,
+      "clip_ratio/high_mean": 1.8656716065379442e-06,
+      "clip_ratio/low_mean": 5.234285907818048e-05,
+      "clip_ratio/low_min": 4.47803950009984e-06,
+      "clip_ratio/region_mean": 5.420853057103159e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7045.6953125,
+      "completions/mean_terminated_length": 6505.46240234375,
+      "completions/min_length": 926.0,
+      "completions/min_terminated_length": 926.0,
+      "entropy": 0.8912066072225571,
+      "epoch": 0.40662373505059796,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018510994268581271,
+      "learning_rate": 1e-05,
+      "loss": 0.099,
+      "num_tokens": 388324475.0,
+      "reward": 0.40625,
+      "reward_std": 0.32195523381233215,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999024868011475,
+      "sampling/importance_sampling_ratio/min": 0.0031757301185280085,
+      "sampling/sampling_logp_difference/max": 5.752217769622803,
+      "sampling/sampling_logp_difference/mean": 0.020547039806842804,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 2.504527083146968e-05,
+      "clip_ratio/high_mean": 6.26131770786742e-06,
+      "clip_ratio/low_mean": 6.165269871871715e-05,
+      "clip_ratio/low_min": 3.5272871627967106e-06,
+      "clip_ratio/region_mean": 6.791401551708987e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15734.0,
+      "completions/mean_length": 7480.0078125,
+      "completions/mean_terminated_length": 7266.3125,
+      "completions/min_length": 1130.0,
+      "completions/min_terminated_length": 1130.0,
+      "entropy": 0.8813760280609131,
+      "epoch": 0.40754369825206993,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004439481534063816,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 389305644.0,
+      "reward": 0.34375,
+      "reward_std": 0.31300368905067444,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999762773513794,
+      "sampling/importance_sampling_ratio/min": 0.007449973840266466,
+      "sampling/sampling_logp_difference/max": 4.899544715881348,
+      "sampling/sampling_logp_difference/mean": 0.01973455585539341,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 4.0980917219712865e-06,
+      "clip_ratio/high_mean": 1.0245229304928216e-06,
+      "clip_ratio/low_mean": 3.662567087303614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.76501939172158e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15302.0,
+      "completions/max_terminated_length": 15302.0,
+      "completions/mean_length": 7044.4453125,
+      "completions/mean_terminated_length": 7044.4453125,
+      "completions/min_length": 1229.0,
+      "completions/min_terminated_length": 1229.0,
+      "entropy": 0.9901906549930573,
+      "epoch": 0.40846366145354185,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.004181519150733948,
+      "learning_rate": 1e-05,
+      "loss": -0.0068,
+      "num_tokens": 390229373.0,
+      "reward": 0.421875,
+      "reward_std": 0.17700131237506866,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000314712524414,
+      "sampling/importance_sampling_ratio/min": 0.00022536676260642707,
+      "sampling/sampling_logp_difference/max": 8.397781372070312,
+      "sampling/sampling_logp_difference/mean": 0.021211043000221252,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 1.4909872106727562e-05,
+      "clip_ratio/high_mean": 3.7274680266818905e-06,
+      "clip_ratio/low_mean": 5.29995777469594e-05,
+      "clip_ratio/low_min": 3.708758640641463e-06,
+      "clip_ratio/region_mean": 5.672704537573736e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7815.8125,
+      "completions/mean_terminated_length": 7244.6005859375,
+      "completions/min_length": 1350.0,
+      "completions/min_terminated_length": 1350.0,
+      "entropy": 0.8278292864561081,
+      "epoch": 0.4093836246550138,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002691390924155712,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 391251141.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31222954392433167,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 0.007715471088886261,
+      "sampling/sampling_logp_difference/max": 4.864527702331543,
+      "sampling/sampling_logp_difference/mean": 0.018415704369544983,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 2.1858722902834415e-05,
+      "clip_ratio/high_mean": 6.629899417021079e-06,
+      "clip_ratio/low_mean": 3.196247394043894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.859237290271267e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15202.0,
+      "completions/mean_length": 5305.1796875,
+      "completions/mean_terminated_length": 5217.94482421875,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.8100772425532341,
+      "epoch": 0.41030358785648574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0069543467834591866,
+      "learning_rate": 1e-05,
+      "loss": 0.1153,
+      "num_tokens": 391956196.0,
+      "reward": 0.609375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000190734863281,
+      "sampling/importance_sampling_ratio/min": 0.0024869756307452917,
+      "sampling/sampling_logp_difference/max": 5.996687889099121,
+      "sampling/sampling_logp_difference/mean": 0.017318082973361015,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 2.461934036546154e-05,
+      "clip_ratio/high_mean": 8.056288947955181e-06,
+      "clip_ratio/low_mean": 5.289376917971822e-05,
+      "clip_ratio/low_min": 4.21926688431995e-06,
+      "clip_ratio/region_mean": 6.0950058468733914e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15300.0,
+      "completions/mean_length": 7299.578125,
+      "completions/mean_terminated_length": 6930.29248046875,
+      "completions/min_length": 1008.0,
+      "completions/min_terminated_length": 1008.0,
+      "entropy": 0.9955824315547943,
+      "epoch": 0.41122355105795766,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0065611582249403,
+      "learning_rate": 1e-05,
+      "loss": 0.0883,
+      "num_tokens": 392908430.0,
+      "reward": 0.4375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999696016311646,
+      "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06,
+      "sampling/sampling_logp_difference/max": 11.873339653015137,
+      "sampling/sampling_logp_difference/mean": 0.02127375639975071,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 2.4339562514796853e-05,
+      "clip_ratio/high_mean": 7.412756531266496e-06,
+      "clip_ratio/low_mean": 3.89272447591793e-05,
+      "clip_ratio/low_min": 4.047796210215893e-06,
+      "clip_ratio/region_mean": 4.6340001517819474e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 6702.9375,
+      "completions/mean_terminated_length": 6390.64501953125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.82919991761446,
+      "epoch": 0.41214351425942963,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032975098583847284,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 393788286.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999115467071533,
+      "sampling/importance_sampling_ratio/min": 0.00028582560480572283,
+      "sampling/sampling_logp_difference/max": 8.160128593444824,
+      "sampling/sampling_logp_difference/mean": 0.019461583346128464,
+      "step": 448
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 393788286,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-448/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-448/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    The weights are written to ``output_dir`` as ``pytorch_model*.bin`` or, with
+    ``safe_serialization``, as ``model*.safetensors``. When the result is split into several shards
+    an index json file mapping every weight to its file is written as well.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          ``None`` disables sharding, everything then goes into one file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: if ``safetensors`` (for ``safe_serialization``) or ``huggingface_hub``
+          (for ``max_shard_size``) is needed but not installed
+    """
+
+    # Dependency pre-check
+    # (the optional packages are imported here, before any checkpoint data is loaded)
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # (lazy mode: the entries are only turned into real tensors shard by shard further below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # insert a ``{suffix}`` placeholder in front of the file extension
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # (the shard layout is planned on empty tensors that only carry shape and dtype)
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding: a minimal stand-in offering the two attributes the code below reads in this case
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # convert only the tensors of the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        # (the entries are dropped from both dicts before the next shard is converted)
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/README.md b/dapo_milora_plus_20251201_131939/checkpoint-512/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-512/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-512/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-512/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/latest b/dapo_milora_plus_20251201_131939/checkpoint-512/latest
new file mode 100644
index 0000000000000000000000000000000000000000..35f851ced1a2a2007c68236a52dfc57e513ef909
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-512/latest
@@ -0,0 +1 @@
+global_step512
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-512/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-512/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-512/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-512/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-512/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a09d2337ba5ef356f2482abac5ccca6256e7b984
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-512/trainer_state.json
@@ -0,0 +1,15906 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.47102115915363385,
+  "eval_steps": 500,
+  "global_step": 512,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 9.941184316630824e-06,
+      "clip_ratio/high_mean": 2.485296079157706e-06,
+      "clip_ratio/low_mean": 2.6134909091979353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8620205910101504e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 8426.1015625,
+      "completions/mean_terminated_length": 7965.72705078125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8188603445887566,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030983765609562397,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 162199765.0,
+      "reward": 0.25,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411106109619,
+      "sampling/importance_sampling_ratio/min": 0.0009119694004766643,
+      "sampling/sampling_logp_difference/max": 6.999904155731201,
+      "sampling/sampling_logp_difference/mean": 0.02070600539445877,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 2.612139087432297e-05,
+      "clip_ratio/high_mean": 6.530347718580742e-06,
+      "clip_ratio/low_mean": 3.7853451885894174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.438379949078808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15904.0,
+      "completions/mean_length": 7154.2109375,
+      "completions/mean_terminated_length": 6856.4755859375,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "entropy": 0.9913735538721085,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003430198412388563,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 163133232.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2120065689086914,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000275373458862,
+      "sampling/importance_sampling_ratio/min": 0.00042929715709760785,
+      "sampling/sampling_logp_difference/max": 7.753361225128174,
+      "sampling/sampling_logp_difference/mean": 0.02190260961651802,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 3.1841454983805306e-06,
+      "clip_ratio/high_mean": 7.960363745951327e-07,
+      "clip_ratio/low_mean": 3.384581600585079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4641852380445926e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 7693.1328125,
+      "completions/mean_terminated_length": 7412.7822265625,
+      "completions/min_length": 1077.0,
+      "completions/min_terminated_length": 1077.0,
+      "entropy": 0.9887127950787544,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002780586015433073,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 164134393.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999028444290161,
+      "sampling/importance_sampling_ratio/min": 3.559096626304381e-07,
+      "sampling/sampling_logp_difference/max": 14.848588943481445,
+      "sampling/sampling_logp_difference/mean": 0.021110571920871735,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 9.770586984814145e-06,
+      "clip_ratio/high_mean": 5.008155312680174e-06,
+      "clip_ratio/low_mean": 5.182203130971175e-05,
+      "clip_ratio/low_min": 1.5574546068819473e-05,
+      "clip_ratio/region_mean": 5.683018616764457e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 7072.1484375,
+      "completions/mean_terminated_length": 6771.76611328125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.861792616546154,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030156150460243225,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 165063412.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998926520347595,
+      "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06,
+      "sampling/sampling_logp_difference/max": 12.999247550964355,
+      "sampling/sampling_logp_difference/mean": 0.019325289875268936,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 2.2510209873871645e-05,
+      "clip_ratio/high_mean": 6.455301331698138e-06,
+      "clip_ratio/low_mean": 6.156819108582567e-05,
+      "clip_ratio/low_min": 5.763157332694391e-06,
+      "clip_ratio/region_mean": 6.802349253121065e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15062.0,
+      "completions/mean_length": 7353.421875,
+      "completions/mean_terminated_length": 7062.11279296875,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "entropy": 0.8961873054504395,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034921523183584213,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 166024306.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.0005124400486238301,
+      "sampling/sampling_logp_difference/max": 7.576326847076416,
+      "sampling/sampling_logp_difference/mean": 0.019593238830566406,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.3040991007073899e-05,
+      "clip_ratio/high_mean": 4.292725350296678e-06,
+      "clip_ratio/low_mean": 5.347559840629401e-05,
+      "clip_ratio/low_min": 6.613406640099129e-06,
+      "clip_ratio/region_mean": 5.776832381343411e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15604.0,
+      "completions/mean_length": 7348.03125,
+      "completions/mean_terminated_length": 6903.63916015625,
+      "completions/min_length": 1619.0,
+      "completions/min_terminated_length": 1619.0,
+      "entropy": 0.824029266834259,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027784397825598717,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 166984982.0,
+      "reward": 0.40625,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 0.0010020677000284195,
+      "sampling/sampling_logp_difference/max": 6.905689716339111,
+      "sampling/sampling_logp_difference/mean": 0.01857386901974678,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 3.330808067403268e-05,
+      "clip_ratio/high_mean": 1.0969530649163062e-05,
+      "clip_ratio/low_mean": 3.2080681648949394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3050211388617754e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16358.0,
+      "completions/mean_length": 7290.4765625,
+      "completions/mean_terminated_length": 6920.82080078125,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8884479627013206,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004110465291887522,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 167936971.0,
+      "reward": 0.4375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06,
+      "sampling/sampling_logp_difference/max": 13.219663619995117,
+      "sampling/sampling_logp_difference/mean": 0.019696572795510292,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 9.77357763076725e-06,
+      "clip_ratio/high_mean": 2.4433944076918124e-06,
+      "clip_ratio/low_mean": 3.466498992565903e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.710838473125477e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 7803.625,
+      "completions/mean_terminated_length": 6833.66943359375,
+      "completions/min_length": 929.0,
+      "completions/min_terminated_length": 929.0,
+      "entropy": 0.8326860442757607,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002410614863038063,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "num_tokens": 168955683.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0008801451185718179,
+      "sampling/sampling_logp_difference/max": 7.035423755645752,
+      "sampling/sampling_logp_difference/mean": 0.018545793369412422,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.4602125929741305e-05,
+      "clip_ratio/high_mean": 3.6505314824353263e-06,
+      "clip_ratio/low_mean": 3.4781527119776e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8432058772741584e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6804.34375,
+      "completions/mean_terminated_length": 6495.322265625,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.9669496119022369,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034376555122435093,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 169845823.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31534504890441895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 1.767780588579626e-08,
+      "sampling/sampling_logp_difference/max": 17.850955963134766,
+      "sampling/sampling_logp_difference/mean": 0.020515555515885353,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 1.5814722473805887e-05,
+      "clip_ratio/high_mean": 3.953680618451472e-06,
+      "clip_ratio/low_mean": 3.574208744794305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9695768407455034e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 6827.9609375,
+      "completions/mean_terminated_length": 6105.23583984375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 0.8833946585655212,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026675171684473753,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 170738210.0,
+      "reward": 0.421875,
+      "reward_std": 0.2698654532432556,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000019907951355,
+      "sampling/importance_sampling_ratio/min": 0.002906275913119316,
+      "sampling/sampling_logp_difference/max": 5.840882778167725,
+      "sampling/sampling_logp_difference/mean": 0.019948139786720276,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.6623121837255894e-05,
+      "clip_ratio/high_mean": 4.1557804593139736e-06,
+      "clip_ratio/low_mean": 6.462372630267055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.877950727357529e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15725.0,
+      "completions/mean_length": 7377.984375,
+      "completions/mean_terminated_length": 7307.07080078125,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8881714344024658,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0039620306342840195,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 171705152.0,
+      "reward": 0.3359375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05,
+      "sampling/sampling_logp_difference/max": 10.614632606506348,
+      "sampling/sampling_logp_difference/mean": 0.01964445412158966,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 9.639111340220552e-06,
+      "clip_ratio/high_mean": 2.409777835055138e-06,
+      "clip_ratio/low_mean": 2.775239624952519e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0162174198267167e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15265.0,
+      "completions/mean_length": 6051.8828125,
+      "completions/mean_terminated_length": 5543.74560546875,
+      "completions/min_length": 819.0,
+      "completions/min_terminated_length": 819.0,
+      "entropy": 0.8851477280259132,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0040458571165800095,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 172501881.0,
+      "reward": 0.4296875,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999410510063171,
+      "sampling/importance_sampling_ratio/min": 0.0021976607386022806,
+      "sampling/sampling_logp_difference/max": 6.120361804962158,
+      "sampling/sampling_logp_difference/mean": 0.01957303285598755,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 9.72708312474424e-06,
+      "clip_ratio/high_mean": 3.529455852913088e-06,
+      "clip_ratio/low_mean": 5.158422732165491e-05,
+      "clip_ratio/low_min": 1.1939961495954776e-05,
+      "clip_ratio/region_mean": 5.5113683174567996e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7830.171875,
+      "completions/mean_terminated_length": 7409.4912109375,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.9070459827780724,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005941574461758137,
+      "learning_rate": 1e-05,
+      "loss": 0.0427,
+      "num_tokens": 173522391.0,
+      "reward": 0.34375,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000017881393433,
+      "sampling/importance_sampling_ratio/min": 0.00011712420382536948,
+      "sampling/sampling_logp_difference/max": 9.052275657653809,
+      "sampling/sampling_logp_difference/mean": 0.021295130252838135,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 5.5543214330100454e-06,
+      "clip_ratio/high_mean": 1.3885803582525114e-06,
+      "clip_ratio/low_mean": 1.718775109793569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8576331683561875e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15443.0,
+      "completions/mean_length": 7520.6796875,
+      "completions/mean_terminated_length": 6769.55078125,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.8843575045466423,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025851845275610685,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 174504534.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 0.00039556476986035705,
+      "sampling/sampling_logp_difference/max": 7.835196018218994,
+      "sampling/sampling_logp_difference/mean": 0.02016005665063858,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 1.0145481155632297e-05,
+      "clip_ratio/high_mean": 2.536370288908074e-06,
+      "clip_ratio/low_mean": 3.617897255026037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.871534295285528e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 7382.1875,
+      "completions/mean_terminated_length": 6861.42138671875,
+      "completions/min_length": 934.0,
+      "completions/min_terminated_length": 934.0,
+      "entropy": 0.916313610970974,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004170550964772701,
+      "learning_rate": 1e-05,
+      "loss": 0.047,
+      "num_tokens": 175472574.0,
+      "reward": 0.46875,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05,
+      "sampling/sampling_logp_difference/max": 10.481352806091309,
+      "sampling/sampling_logp_difference/mean": 0.020749717950820923,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.83663013963087e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.83663013963087e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 6122.453125,
+      "completions/mean_terminated_length": 6041.6533203125,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.8984386026859283,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 176275568.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 7.88934721640544e-06,
+      "sampling/sampling_logp_difference/max": 11.74999713897705,
+      "sampling/sampling_logp_difference/mean": 0.020278753712773323,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 1.4535152331518475e-05,
+      "clip_ratio/high_mean": 3.6337880828796187e-06,
+      "clip_ratio/low_mean": 4.3961883989140915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7595671958333696e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15547.0,
+      "completions/mean_length": 4983.2890625,
+      "completions/mean_terminated_length": 4709.67236328125,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.825260303914547,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004848882555961609,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 176932549.0,
+      "reward": 0.6484375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616146087646,
+      "sampling/importance_sampling_ratio/min": 1.626804078114219e-05,
+      "sampling/sampling_logp_difference/max": 11.026308059692383,
+      "sampling/sampling_logp_difference/mean": 0.017959970980882645,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.1141860795760294e-05,
+      "clip_ratio/high_mean": 2.7854651989400736e-06,
+      "clip_ratio/low_mean": 4.2418692146384274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5204157913758536e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 5766.5234375,
+      "completions/mean_terminated_length": 5511.7041015625,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.9016259610652924,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004749474115669727,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 177691752.0,
+      "reward": 0.5,
+      "reward_std": 0.2738044261932373,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000141859054565,
+      "sampling/importance_sampling_ratio/min": 8.927558155846782e-06,
+      "sampling/sampling_logp_difference/max": 11.626367568969727,
+      "sampling/sampling_logp_difference/mean": 0.019118282943964005,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 5.5243735914700665e-06,
+      "clip_ratio/high_mean": 2.1587275114143267e-06,
+      "clip_ratio/low_mean": 4.609663824339805e-05,
+      "clip_ratio/low_min": 3.983555870945565e-06,
+      "clip_ratio/region_mean": 4.8255366664307076e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15696.0,
+      "completions/mean_length": 6993.671875,
+      "completions/mean_terminated_length": 6768.30419921875,
+      "completions/min_length": 889.0,
+      "completions/min_terminated_length": 889.0,
+      "entropy": 0.9074988812208176,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004418120253831148,
+      "learning_rate": 1e-05,
+      "loss": 0.1135,
+      "num_tokens": 178603454.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000037670135498,
+      "sampling/importance_sampling_ratio/min": 0.0018135923892259598,
+      "sampling/sampling_logp_difference/max": 6.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01957814022898674,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 5.126943051436683e-06,
+      "clip_ratio/high_mean": 1.2817357628591708e-06,
+      "clip_ratio/low_mean": 2.7488794444252562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.877053032079857e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 7445.1328125,
+      "completions/mean_terminated_length": 6849.20849609375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9255013465881348,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00237120408564806,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 179577063.0,
+      "reward": 0.40625,
+      "reward_std": 0.21040897071361542,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725818634033,
+      "sampling/importance_sampling_ratio/min": 9.651589061832055e-05,
+      "sampling/sampling_logp_difference/max": 9.245802879333496,
+      "sampling/sampling_logp_difference/mean": 0.02165937051177025,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.8956294752570102e-05,
+      "clip_ratio/high_mean": 4.7390736881425255e-06,
+      "clip_ratio/low_mean": 2.6486316301088664e-05,
+      "clip_ratio/low_min": 3.516273409331916e-06,
+      "clip_ratio/region_mean": 3.122539010291803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6120.5546875,
+      "completions/mean_terminated_length": 5703.34130859375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8181199952960014,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004715202376246452,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 180380422.0,
+      "reward": 0.5,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999874472618103,
+      "sampling/importance_sampling_ratio/min": 0.004350374918431044,
+      "sampling/sampling_logp_difference/max": 5.437493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018377620726823807,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 5.594843969447538e-06,
+      "clip_ratio/high_mean": 2.376495558564784e-06,
+      "clip_ratio/low_mean": 3.4097628713425365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6474124044616474e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 6351.203125,
+      "completions/mean_terminated_length": 5857.78662109375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.8798654451966286,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003063712501898408,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 181212776.0,
+      "reward": 0.453125,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999946355819702,
+      "sampling/importance_sampling_ratio/min": 7.891544555604924e-06,
+      "sampling/sampling_logp_difference/max": 11.74971866607666,
+      "sampling/sampling_logp_difference/mean": 0.019523698836565018,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.544438988001275e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.544438988001275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14180.0,
+      "completions/mean_length": 6330.046875,
+      "completions/mean_terminated_length": 6170.46044921875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.8319354206323624,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033194730058312416,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 182041910.0,
+      "reward": 0.453125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998994469642639,
+      "sampling/importance_sampling_ratio/min": 0.00010535263572819531,
+      "sampling/sampling_logp_difference/max": 9.158197402954102,
+      "sampling/sampling_logp_difference/mean": 0.018981872126460075,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7156292415165808e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7156292415165808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15982.0,
+      "completions/mean_length": 6665.2890625,
+      "completions/mean_terminated_length": 6351.7822265625,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9336326420307159,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004492956213653088,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 182914843.0,
+      "reward": 0.3828125,
+      "reward_std": 0.14807432889938354,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 0.011399568989872932,
+      "sampling/sampling_logp_difference/max": 4.474179744720459,
+      "sampling/sampling_logp_difference/mean": 0.02088768407702446,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.2495465802639956e-05,
+      "clip_ratio/high_mean": 9.084843100026774e-06,
+      "clip_ratio/low_mean": 5.4809036328151706e-05,
+      "clip_ratio/low_min": 8.953898031904828e-06,
+      "clip_ratio/region_mean": 6.389387954186532e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16064.0,
+      "completions/mean_length": 5393.9140625,
+      "completions/mean_terminated_length": 5039.39501953125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.7864786610007286,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003816079581156373,
+      "learning_rate": 1e-05,
+      "loss": -0.004,
+      "num_tokens": 183628152.0,
+      "reward": 0.546875,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998779892921448,
+      "sampling/importance_sampling_ratio/min": 0.003246711567044258,
+      "sampling/sampling_logp_difference/max": 5.730112552642822,
+      "sampling/sampling_logp_difference/mean": 0.018448319286108017,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 8.638648068881594e-06,
+      "clip_ratio/high_mean": 2.1596620172203984e-06,
+      "clip_ratio/low_mean": 1.6896704778446292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9056366909353528e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 7161.5,
+      "completions/mean_terminated_length": 7015.111328125,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.915394201874733,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003666195785626769,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 184562352.0,
+      "reward": 0.3671875,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00025550799909979105,
+      "sampling/sampling_logp_difference/max": 8.272256851196289,
+      "sampling/sampling_logp_difference/mean": 0.019755780696868896,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 6.424931598303374e-06,
+      "clip_ratio/high_mean": 1.6062328995758435e-06,
+      "clip_ratio/low_mean": 2.49038239417132e-05,
+      "clip_ratio/low_min": 4.00025601265952e-06,
+      "clip_ratio/region_mean": 2.651005689813246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15408.0,
+      "completions/mean_length": 7957.671875,
+      "completions/mean_terminated_length": 7685.8544921875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "entropy": 1.1176252663135529,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025940234772861004,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 185606670.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999893844127655,
+      "sampling/importance_sampling_ratio/min": 0.0007622809498570859,
+      "sampling/sampling_logp_difference/max": 7.179195404052734,
+      "sampling/sampling_logp_difference/mean": 0.02338646724820137,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 1.9903963220713194e-05,
+      "clip_ratio/high_mean": 5.829163114867697e-06,
+      "clip_ratio/low_mean": 4.4742550926457625e-05,
+      "clip_ratio/low_min": 3.5803282116830815e-06,
+      "clip_ratio/region_mean": 5.057171370026481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 7060.6640625,
+      "completions/mean_terminated_length": 6759.9111328125,
+      "completions/min_length": 1460.0,
+      "completions/min_terminated_length": 1460.0,
+      "entropy": 0.9148540124297142,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004315398633480072,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 186526883.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0004585353017318994,
+      "sampling/sampling_logp_difference/max": 7.687473297119141,
+      "sampling/sampling_logp_difference/mean": 0.01967843994498253,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 1.147099328591139e-05,
+      "clip_ratio/high_mean": 2.8677483214778476e-06,
+      "clip_ratio/low_mean": 2.8967988555450574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1835736763241584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15596.0,
+      "completions/mean_length": 6649.6640625,
+      "completions/mean_terminated_length": 6416.04052734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9298559054732323,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030786178540438414,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 187397536.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000005841255188,
+      "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07,
+      "sampling/sampling_logp_difference/max": 14.929608345031738,
+      "sampling/sampling_logp_difference/mean": 0.020215414464473724,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 2.2768570943298982e-05,
+      "clip_ratio/high_mean": 5.692142735824746e-06,
+      "clip_ratio/low_mean": 3.249637484259438e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8188517464732286e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 8292.015625,
+      "completions/mean_terminated_length": 7823.8837890625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.8232023045420647,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002438523108139634,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 188477778.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.005636279005557299,
+      "sampling/sampling_logp_difference/max": 5.178531169891357,
+      "sampling/sampling_logp_difference/mean": 0.018984414637088776,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 2.0840709566982696e-05,
+      "clip_ratio/high_mean": 6.135253556749376e-06,
+      "clip_ratio/low_mean": 2.255633432923787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.869158777230041e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15991.0,
+      "completions/mean_length": 7600.9765625,
+      "completions/mean_terminated_length": 6936.71484375,
+      "completions/min_length": 995.0,
+      "completions/min_terminated_length": 995.0,
+      "entropy": 0.8689917623996735,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004773247055709362,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 189470655.0,
+      "reward": 0.40625,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.001327168894931674,
+      "sampling/sampling_logp_difference/max": 6.624707221984863,
+      "sampling/sampling_logp_difference/mean": 0.018666012212634087,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 9.837458947004052e-06,
+      "clip_ratio/high_mean": 2.459364736751013e-06,
+      "clip_ratio/low_mean": 6.463955219260242e-05,
+      "clip_ratio/low_min": 1.0895145351241808e-05,
+      "clip_ratio/region_mean": 6.70989177251613e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16215.0,
+      "completions/mean_length": 7600.34375,
+      "completions/mean_terminated_length": 6855.96630859375,
+      "completions/min_length": 1335.0,
+      "completions/min_terminated_length": 1335.0,
+      "entropy": 0.7636929750442505,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004298723768442869,
+      "learning_rate": 1e-05,
+      "loss": 0.145,
+      "num_tokens": 190462227.0,
+      "reward": 0.515625,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999310374259949,
+      "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05,
+      "sampling/sampling_logp_difference/max": 9.996363639831543,
+      "sampling/sampling_logp_difference/mean": 0.018035393208265305,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 1.4060602325116633e-05,
+      "clip_ratio/high_mean": 3.5151505812791584e-06,
+      "clip_ratio/low_mean": 2.6516039497437305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.003119024924672e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15151.0,
+      "completions/mean_length": 6512.0,
+      "completions/mean_terminated_length": 6434.267578125,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.9043584689497948,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006741553544998169,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 191312483.0,
+      "reward": 0.484375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 1.778468504198827e-05,
+      "sampling/sampling_logp_difference/max": 10.937172889709473,
+      "sampling/sampling_logp_difference/mean": 0.020878732204437256,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 1.7356085209030425e-05,
+      "clip_ratio/high_mean": 4.339021302257606e-06,
+      "clip_ratio/low_mean": 2.8831826739406097e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.317084781429003e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7178.6875,
+      "completions/mean_terminated_length": 6565.00048828125,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.8899475410580635,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00281486171297729,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 192251235.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2240736484527588,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999714493751526,
+      "sampling/importance_sampling_ratio/min": 9.012543159769848e-05,
+      "sampling/sampling_logp_difference/max": 9.314308166503906,
+      "sampling/sampling_logp_difference/mean": 0.020196784287691116,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.5558084214717383e-05,
+      "clip_ratio/high_mean": 3.889521053679346e-06,
+      "clip_ratio/low_mean": 3.0248688972278615e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.413820991227112e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15501.0,
+      "completions/max_terminated_length": 15501.0,
+      "completions/mean_length": 6602.5625,
+      "completions/mean_terminated_length": 6602.5625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.9266818463802338,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005070593673735857,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193116763.0,
+      "reward": 0.53125,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999746680259705,
+      "sampling/importance_sampling_ratio/min": 2.726537559283315e-06,
+      "sampling/sampling_logp_difference/max": 12.812478065490723,
+      "sampling/sampling_logp_difference/mean": 0.020026464015245438,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.188727416476468e-06,
+      "clip_ratio/high_mean": 1.047181854119117e-06,
+      "clip_ratio/low_mean": 2.959152834591805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.063871008635033e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6818.8828125,
+      "completions/mean_terminated_length": 6430.056640625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.874519519507885,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006362155079841614,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 194007868.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009298324585,
+      "sampling/importance_sampling_ratio/min": 0.0005216691642999649,
+      "sampling/sampling_logp_difference/max": 7.55847692489624,
+      "sampling/sampling_logp_difference/mean": 0.01943325623869896,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 9.645911177358357e-06,
+      "clip_ratio/high_mean": 2.4114777943395893e-06,
+      "clip_ratio/low_mean": 6.821557258263056e-05,
+      "clip_ratio/low_min": 1.7265090718865395e-05,
+      "clip_ratio/region_mean": 7.062705049065698e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14536.0,
+      "completions/mean_length": 5515.625,
+      "completions/mean_terminated_length": 5343.111328125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0683523043990135,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003797185141593218,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "num_tokens": 194735980.0,
+      "reward": 0.421875,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 1.137102216830499e-07,
+      "sampling/sampling_logp_difference/max": 15.989612579345703,
+      "sampling/sampling_logp_difference/mean": 0.02120930328965187,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.1971412252241862e-05,
+      "clip_ratio/high_mean": 5.4928530630604655e-06,
+      "clip_ratio/low_mean": 4.9151800567415194e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4644653801005916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 5853.546875,
+      "completions/mean_terminated_length": 5770.6298828125,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.7975900694727898,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004124365746974945,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 195504882.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000672340393066,
+      "sampling/importance_sampling_ratio/min": 0.0032877910416573286,
+      "sampling/sampling_logp_difference/max": 5.717539310455322,
+      "sampling/sampling_logp_difference/mean": 0.017819223925471306,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 7.066538728395244e-06,
+      "clip_ratio/high_mean": 2.843255515472265e-06,
+      "clip_ratio/low_mean": 5.1467116236381116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.431037175185338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6686.25,
+      "completions/mean_terminated_length": 6532.31787109375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.9018580466508865,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024995009880512953,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 196379306.0,
+      "reward": 0.421875,
+      "reward_std": 0.35824593901634216,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999300837516785,
+      "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05,
+      "sampling/sampling_logp_difference/max": 10.818918228149414,
+      "sampling/sampling_logp_difference/mean": 0.018989525735378265,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 6.652828687947476e-06,
+      "clip_ratio/high_mean": 2.5722979444253724e-06,
+      "clip_ratio/low_mean": 3.699686294567073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95691608900961e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16347.0,
+      "completions/mean_length": 7487.3359375,
+      "completions/mean_terminated_length": 7200.3466796875,
+      "completions/min_length": 1222.0,
+      "completions/min_terminated_length": 1222.0,
+      "entropy": 0.9890001565217972,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004295211285352707,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 197357397.0,
+      "reward": 0.40625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0006548459641635418,
+      "sampling/sampling_logp_difference/max": 7.33111047744751,
+      "sampling/sampling_logp_difference/mean": 0.02209121733903885,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 6.0850939007650595e-06,
+      "clip_ratio/high_mean": 1.5212734751912649e-06,
+      "clip_ratio/low_mean": 2.9443070673096372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0964344205131056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7233.484375,
+      "completions/mean_terminated_length": 6938.30615234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.9683803990483284,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003119673579931259,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 198303795.0,
+      "reward": 0.328125,
+      "reward_std": 0.23014704883098602,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000243186950684,
+      "sampling/importance_sampling_ratio/min": 0.020358745008707047,
+      "sampling/sampling_logp_difference/max": 3.89424467086792,
+      "sampling/sampling_logp_difference/mean": 0.021085180342197418,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.963812095113099e-06,
+      "clip_ratio/high_mean": 1.9909530237782747e-06,
+      "clip_ratio/low_mean": 4.031422963635123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.23051826601295e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15733.0,
+      "completions/mean_length": 6457.78125,
+      "completions/mean_terminated_length": 6300.22265625,
+      "completions/min_length": 850.0,
+      "completions/min_terminated_length": 850.0,
+      "entropy": 0.8881053999066353,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033790848683565855,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 199154735.0,
+      "reward": 0.3828125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998799562454224,
+      "sampling/importance_sampling_ratio/min": 2.872048128210736e-07,
+      "sampling/sampling_logp_difference/max": 15.063070297241211,
+      "sampling/sampling_logp_difference/mean": 0.01950821653008461,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 9.059622016138746e-06,
+      "clip_ratio/high_mean": 3.3430123380639998e-06,
+      "clip_ratio/low_mean": 2.2856192117615137e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6199204512522556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 7904.40625,
+      "completions/mean_terminated_length": 7769.81005859375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9881557524204254,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021492803934961557,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 200185643.0,
+      "reward": 0.359375,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001094341278076,
+      "sampling/importance_sampling_ratio/min": 0.001458622980862856,
+      "sampling/sampling_logp_difference/max": 6.530262470245361,
+      "sampling/sampling_logp_difference/mean": 0.021201875060796738,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 6.9962839006620925e-06,
+      "clip_ratio/high_mean": 1.7490709751655231e-06,
+      "clip_ratio/low_mean": 3.018811844412994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193718976035598e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 7414.4921875,
+      "completions/mean_terminated_length": 7414.4921875,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "entropy": 0.9571134969592094,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037221095990389585,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 201153114.0,
+      "reward": 0.4375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999958872795105,
+      "sampling/importance_sampling_ratio/min": 0.0009130563121289015,
+      "sampling/sampling_logp_difference/max": 6.99871301651001,
+      "sampling/sampling_logp_difference/mean": 0.021356744691729546,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 1.1248092050664127e-05,
+      "clip_ratio/high_mean": 2.8120230126660317e-06,
+      "clip_ratio/low_mean": 5.4354991334548686e-05,
+      "clip_ratio/low_min": 6.868132004456129e-06,
+      "clip_ratio/region_mean": 5.716701480196207e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15835.0,
+      "completions/max_terminated_length": 15835.0,
+      "completions/mean_length": 5955.953125,
+      "completions/mean_terminated_length": 5955.953125,
+      "completions/min_length": 1394.0,
+      "completions/min_terminated_length": 1394.0,
+      "entropy": 0.730999618768692,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006285305600613356,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 201933044.0,
+      "reward": 0.59375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.007535050623118877,
+      "sampling/sampling_logp_difference/max": 4.888189792633057,
+      "sampling/sampling_logp_difference/mean": 0.016975615173578262,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 7.226686648209579e-06,
+      "clip_ratio/high_mean": 3.094216481258627e-06,
+      "clip_ratio/low_mean": 4.66828214484849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.977703792974353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6923.3515625,
+      "completions/mean_terminated_length": 6458.0732421875,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9938417226076126,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005667983554303646,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 202837281.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05,
+      "sampling/sampling_logp_difference/max": 10.402952194213867,
+      "sampling/sampling_logp_difference/mean": 0.022059854120016098,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 5.2318769121484365e-06,
+      "clip_ratio/high_mean": 1.3079692280371091e-06,
+      "clip_ratio/low_mean": 4.239228087499214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3700250216716086e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14726.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 5930.9296875,
+      "completions/mean_terminated_length": 5930.9296875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.8100385963916779,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004052883945405483,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 203614448.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999989926815033,
+      "sampling/importance_sampling_ratio/min": 0.00015170808183029294,
+      "sampling/sampling_logp_difference/max": 8.79355239868164,
+      "sampling/sampling_logp_difference/mean": 0.018519222736358643,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 4.905230980511988e-06,
+      "clip_ratio/high_mean": 1.226307745127997e-06,
+      "clip_ratio/low_mean": 5.500513248080097e-05,
+      "clip_ratio/low_min": 7.924934834591113e-06,
+      "clip_ratio/region_mean": 5.6231440112242126e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14996.0,
+      "completions/mean_length": 6911.1015625,
+      "completions/mean_terminated_length": 6108.3134765625,
+      "completions/min_length": 862.0,
+      "completions/min_terminated_length": 862.0,
+      "entropy": 0.9260227829217911,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004494607914239168,
+      "learning_rate": 1e-05,
+      "loss": 0.0269,
+      "num_tokens": 204518261.0,
+      "reward": 0.4140625,
+      "reward_std": 0.34033796191215515,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998886585235596,
+      "sampling/importance_sampling_ratio/min": 0.0015266009140759706,
+      "sampling/sampling_logp_difference/max": 6.484711647033691,
+      "sampling/sampling_logp_difference/mean": 0.020527629181742668,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 8.293764039990492e-06,
+      "clip_ratio/high_mean": 2.073441009997623e-06,
+      "clip_ratio/low_mean": 4.75325257411896e-05,
+      "clip_ratio/low_min": 3.599504680096288e-06,
+      "clip_ratio/region_mean": 4.960596663750039e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14637.0,
+      "completions/mean_length": 6972.921875,
+      "completions/mean_terminated_length": 6823.5400390625,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 1.0095533654093742,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029451537411659956,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 205433843.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05,
+      "sampling/sampling_logp_difference/max": 10.53177547454834,
+      "sampling/sampling_logp_difference/mean": 0.02013089321553707,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 4.163383164268453e-05,
+      "clip_ratio/high_mean": 1.382379150527413e-05,
+      "clip_ratio/low_mean": 3.86000854177837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2423876240936806e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 6706.6640625,
+      "completions/mean_terminated_length": 6313.2763671875,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 0.8647518903017044,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003371767932549119,
+      "learning_rate": 1e-05,
+      "loss": 0.073,
+      "num_tokens": 206310296.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 2.948181463580113e-05,
+      "sampling/sampling_logp_difference/max": 10.431736946105957,
+      "sampling/sampling_logp_difference/mean": 0.019770190119743347,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4946740381892596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4946740381892596e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 6882.609375,
+      "completions/mean_terminated_length": 6415.32763671875,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "entropy": 1.013342760503292,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016336971893906593,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 207210974.0,
+      "reward": 0.359375,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999210834503174,
+      "sampling/importance_sampling_ratio/min": 0.0013267879839986563,
+      "sampling/sampling_logp_difference/max": 6.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.02139991894364357,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.4866403944324702e-05,
+      "clip_ratio/high_mean": 3.7166009860811755e-06,
+      "clip_ratio/low_mean": 3.938925010515959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.310585177336179e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15203.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 6195.7421875,
+      "completions/mean_terminated_length": 6195.7421875,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.8448907434940338,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005036406684666872,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208021893.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955892562866,
+      "sampling/importance_sampling_ratio/min": 0.0040348549373447895,
+      "sampling/sampling_logp_difference/max": 5.512784957885742,
+      "sampling/sampling_logp_difference/mean": 0.018679853528738022,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.1244883353356272e-05,
+      "clip_ratio/high_mean": 2.811220838339068e-06,
+      "clip_ratio/low_mean": 3.422392001084518e-05,
+      "clip_ratio/low_min": 6.451612989621935e-06,
+      "clip_ratio/region_mean": 3.703514119024476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6829.609375,
+      "completions/mean_terminated_length": 6521.40283203125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.8679579794406891,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029643685556948185,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 208912059.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00038063788088038564,
+      "sampling/sampling_logp_difference/max": 7.873661994934082,
+      "sampling/sampling_logp_difference/mean": 0.018488366156816483,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 2.2700600311509334e-05,
+      "clip_ratio/high_mean": 5.675150077877333e-06,
+      "clip_ratio/low_mean": 3.138338854569156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.705853873725573e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14503.0,
+      "completions/max_terminated_length": 14503.0,
+      "completions/mean_length": 5444.4453125,
+      "completions/mean_terminated_length": 5444.4453125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0460086688399315,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035942886024713516,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "num_tokens": 209627804.0,
+      "reward": 0.484375,
+      "reward_std": 0.338498055934906,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997478723526,
+      "sampling/importance_sampling_ratio/min": 0.03179635480046272,
+      "sampling/sampling_logp_difference/max": 3.4484035968780518,
+      "sampling/sampling_logp_difference/mean": 0.020146891474723816,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.477029400120955e-05,
+      "clip_ratio/high_mean": 4.552578502625693e-06,
+      "clip_ratio/low_mean": 5.265122354103369e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.720380158891203e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16244.0,
+      "completions/mean_length": 7657.390625,
+      "completions/mean_terminated_length": 7152.544921875,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 0.9528728649020195,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0044983453117311,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 210630150.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000007152557373,
+      "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05,
+      "sampling/sampling_logp_difference/max": 10.158285140991211,
+      "sampling/sampling_logp_difference/mean": 0.02131088823080063,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 8.607642712377128e-06,
+      "clip_ratio/high_mean": 2.151910678094282e-06,
+      "clip_ratio/low_mean": 2.2759413695894182e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.491132454451872e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7574.3515625,
+      "completions/mean_terminated_length": 7504.984375,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 1.0009776800870895,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006095650140196085,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 211620355.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 0.0013946897815912962,
+      "sampling/sampling_logp_difference/max": 6.575083255767822,
+      "sampling/sampling_logp_difference/mean": 0.021727774292230606,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.764823082339717e-05,
+      "clip_ratio/high_mean": 5.141430960975413e-06,
+      "clip_ratio/low_mean": 5.936152001595474e-05,
+      "clip_ratio/low_min": 9.155588486464694e-06,
+      "clip_ratio/region_mean": 6.450295177273802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14915.0,
+      "completions/mean_length": 7919.6875,
+      "completions/mean_terminated_length": 7716.54443359375,
+      "completions/min_length": 1517.0,
+      "completions/min_terminated_length": 1517.0,
+      "entropy": 1.0405654236674309,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037038614973425865,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 212654747.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381899833679,
+      "sampling/importance_sampling_ratio/min": 0.0057550109922885895,
+      "sampling/sampling_logp_difference/max": 5.157684326171875,
+      "sampling/sampling_logp_difference/mean": 0.022051017731428146,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.265254240934155e-05,
+      "clip_ratio/high_mean": 3.1631356023353874e-06,
+      "clip_ratio/low_mean": 4.716233138424286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.032546687289141e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 8613.4765625,
+      "completions/mean_terminated_length": 7735.0693359375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.890489287674427,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325607368722558,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 213774584.0,
+      "reward": 0.40625,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000060796737671,
+      "sampling/importance_sampling_ratio/min": 1.670176425250247e-05,
+      "sampling/sampling_logp_difference/max": 10.999996185302734,
+      "sampling/sampling_logp_difference/mean": 0.020002499222755432,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.6404605503339553e-05,
+      "clip_ratio/high_mean": 4.101151375834888e-06,
+      "clip_ratio/low_mean": 3.880500707964529e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2906158682853857e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7324.8984375,
+      "completions/mean_terminated_length": 6473.1884765625,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 0.761004202067852,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038265211042016745,
+      "learning_rate": 1e-05,
+      "loss": 0.0717,
+      "num_tokens": 214728371.0,
+      "reward": 0.515625,
+      "reward_std": 0.32719239592552185,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000168085098267,
+      "sampling/importance_sampling_ratio/min": 0.0003049026126973331,
+      "sampling/sampling_logp_difference/max": 8.095518112182617,
+      "sampling/sampling_logp_difference/mean": 0.018367979675531387,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 5.624549885396846e-06,
+      "clip_ratio/high_mean": 1.4061374713492114e-06,
+      "clip_ratio/low_mean": 3.6433707123251224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7839844594600436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14167.0,
+      "completions/max_terminated_length": 14167.0,
+      "completions/mean_length": 6422.0859375,
+      "completions/mean_terminated_length": 6422.0859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.9946094751358032,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002729539293795824,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 215570806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.026308411732316017,
+      "sampling/sampling_logp_difference/max": 3.637866497039795,
+      "sampling/sampling_logp_difference/mean": 0.021903935819864273,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 7.2379848461423535e-06,
+      "clip_ratio/high_mean": 1.8094962115355884e-06,
+      "clip_ratio/low_mean": 3.17277934982485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353728982347093e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15585.0,
+      "completions/mean_length": 6845.2890625,
+      "completions/mean_terminated_length": 6693.88134765625,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8822609707713127,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004974282346665859,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 216465635.0,
+      "reward": 0.5390625,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 8.749838889343664e-05,
+      "sampling/sampling_logp_difference/max": 9.343890190124512,
+      "sampling/sampling_logp_difference/mean": 0.019389234483242035,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.58592818024772e-05,
+      "clip_ratio/high_mean": 3.9648204506193e-06,
+      "clip_ratio/low_mean": 4.096964960353944e-05,
+      "clip_ratio/low_min": 1.7403560605089297e-05,
+      "clip_ratio/region_mean": 4.49344687467601e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 7805.484375,
+      "completions/mean_terminated_length": 7528.7578125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.9977599084377289,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0033159854356199503,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 217485089.0,
+      "reward": 0.421875,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999412298202515,
+      "sampling/importance_sampling_ratio/min": 7.967943383846432e-05,
+      "sampling/sampling_logp_difference/max": 9.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.021925684064626694,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.8265397557115648e-05,
+      "clip_ratio/high_mean": 4.566349389278912e-06,
+      "clip_ratio/low_mean": 4.044636898470344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5012717691861326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15681.0,
+      "completions/mean_length": 7737.5546875,
+      "completions/mean_terminated_length": 7530.04052734375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.8667014688253403,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034952745772898197,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "num_tokens": 218496040.0,
+      "reward": 0.453125,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999128580093384,
+      "sampling/importance_sampling_ratio/min": 6.726370338583365e-05,
+      "sampling/sampling_logp_difference/max": 9.606889724731445,
+      "sampling/sampling_logp_difference/mean": 0.019742710515856743,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 8.244294804171659e-06,
+      "clip_ratio/high_mean": 2.0610737010429148e-06,
+      "clip_ratio/low_mean": 3.204250072030845e-05,
+      "clip_ratio/low_min": 3.323495775475749e-06,
+      "clip_ratio/region_mean": 3.410357436450795e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15858.0,
+      "completions/mean_length": 7365.84375,
+      "completions/mean_terminated_length": 6601.59326171875,
+      "completions/min_length": 744.0,
+      "completions/min_terminated_length": 744.0,
+      "entropy": 0.8151945173740387,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038676802068948746,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 219459140.0,
+      "reward": 0.46875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00023387260443996638,
+      "sampling/sampling_logp_difference/max": 8.360733985900879,
+      "sampling/sampling_logp_difference/mean": 0.018882082775235176,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 6.87833608026267e-06,
+      "clip_ratio/high_mean": 2.9462287329806713e-06,
+      "clip_ratio/low_mean": 5.435333650893881e-05,
+      "clip_ratio/low_min": 5.33937054569833e-06,
+      "clip_ratio/region_mean": 5.729956546929316e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 6448.0078125,
+      "completions/mean_terminated_length": 6369.771484375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9546648040413857,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004310046322643757,
+      "learning_rate": 1e-05,
+      "loss": 0.1082,
+      "num_tokens": 220304605.0,
+      "reward": 0.5703125,
+      "reward_std": 0.35611939430236816,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999396800994873,
+      "sampling/importance_sampling_ratio/min": 0.0001234127557836473,
+      "sampling/sampling_logp_difference/max": 8.99997615814209,
+      "sampling/sampling_logp_difference/mean": 0.020253397524356842,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 6.196094091137638e-06,
+      "clip_ratio/high_mean": 1.5490235227844096e-06,
+      "clip_ratio/low_mean": 2.5416685957679874e-05,
+      "clip_ratio/low_min": 5.5736391004757024e-06,
+      "clip_ratio/region_mean": 2.696570959415112e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 7457.6484375,
+      "completions/mean_terminated_length": 6941.24755859375,
+      "completions/min_length": 604.0,
+      "completions/min_terminated_length": 604.0,
+      "entropy": 0.8182889074087143,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026646999176591635,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 221281968.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2012200653553009,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173283576965,
+      "sampling/importance_sampling_ratio/min": 2.902353571698768e-06,
+      "sampling/sampling_logp_difference/max": 12.749988555908203,
+      "sampling/sampling_logp_difference/mean": 0.019208962097764015,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 1.6189535017474554e-05,
+      "clip_ratio/high_mean": 4.047383754368639e-06,
+      "clip_ratio/low_mean": 3.127787306311802e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.532525670379982e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 8561.109375,
+      "completions/mean_terminated_length": 7969.79052734375,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.9581378549337387,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016026750672608614,
+      "learning_rate": 1e-05,
+      "loss": 0.0131,
+      "num_tokens": 222399046.0,
+      "reward": 0.34375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 1.653693971093162e-06,
+      "sampling/sampling_logp_difference/max": 13.312499046325684,
+      "sampling/sampling_logp_difference/mean": 0.02173236384987831,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 1.4200771602190798e-05,
+      "clip_ratio/high_mean": 4.3255887476334465e-06,
+      "clip_ratio/low_mean": 5.2955770115659107e-05,
+      "clip_ratio/low_min": 3.402656830076012e-06,
+      "clip_ratio/region_mean": 5.7281358749605715e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16239.0,
+      "completions/mean_length": 7152.34375,
+      "completions/mean_terminated_length": 7079.6533203125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9052041247487068,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005460259038954973,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 223335010.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3356297016143799,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999966621398926,
+      "sampling/importance_sampling_ratio/min": 0.010161337442696095,
+      "sampling/sampling_logp_difference/max": 4.589165210723877,
+      "sampling/sampling_logp_difference/mean": 0.01986619457602501,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 1.4350314813782461e-05,
+      "clip_ratio/high_mean": 3.5875787034456152e-06,
+      "clip_ratio/low_mean": 3.81288905373367e-05,
+      "clip_ratio/low_min": 8.099272235995159e-06,
+      "clip_ratio/region_mean": 4.1716469809216505e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 6678.65625,
+      "completions/mean_terminated_length": 6524.603515625,
+      "completions/min_length": 963.0,
+      "completions/min_terminated_length": 963.0,
+      "entropy": 0.9043187350034714,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005933742038905621,
+      "learning_rate": 1e-05,
+      "loss": 0.0966,
+      "num_tokens": 224207006.0,
+      "reward": 0.484375,
+      "reward_std": 0.3316681981086731,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000031590461731,
+      "sampling/importance_sampling_ratio/min": 0.0011734943836927414,
+      "sampling/sampling_logp_difference/max": 6.747769355773926,
+      "sampling/sampling_logp_difference/mean": 0.019827336072921753,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 1.6498819377375185e-05,
+      "clip_ratio/high_mean": 4.124704844343796e-06,
+      "clip_ratio/low_mean": 3.601791678420341e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.014262168539062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6999.0390625,
+      "completions/mean_terminated_length": 6850.07177734375,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8109970837831497,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003635740838944912,
+      "learning_rate": 1e-05,
+      "loss": 0.104,
+      "num_tokens": 225122891.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999303817749023,
+      "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05,
+      "sampling/sampling_logp_difference/max": 10.987512588500977,
+      "sampling/sampling_logp_difference/mean": 0.018912551924586296,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 9.527577958579059e-06,
+      "clip_ratio/high_mean": 2.3818944896447647e-06,
+      "clip_ratio/low_mean": 3.766565987461945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.004755419373396e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15713.0,
+      "completions/mean_length": 7483.7109375,
+      "completions/mean_terminated_length": 7045.9912109375,
+      "completions/min_length": 1153.0,
+      "completions/min_terminated_length": 1153.0,
+      "entropy": 0.9473970532417297,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003405241761356592,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 226102462.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00002920627594,
+      "sampling/importance_sampling_ratio/min": 0.00525119062513113,
+      "sampling/sampling_logp_difference/max": 5.249300479888916,
+      "sampling/sampling_logp_difference/mean": 0.021076779812574387,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.5867321963014547e-05,
+      "clip_ratio/high_mean": 3.966830490753637e-06,
+      "clip_ratio/low_mean": 3.8259706570897833e-05,
+      "clip_ratio/low_min": 3.549019083948224e-06,
+      "clip_ratio/region_mean": 4.2226537743772496e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 7569.03125,
+      "completions/mean_terminated_length": 7357.47216796875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9231455475091934,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025927501264959574,
+      "learning_rate": 1e-05,
+      "loss": 0.0801,
+      "num_tokens": 227093562.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19097033143043518,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0052477638237178326,
+      "sampling/sampling_logp_difference/max": 5.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.020578444004058838,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.344091060673236e-05,
+      "clip_ratio/high_mean": 3.36022765168309e-06,
+      "clip_ratio/low_mean": 4.253613235505327e-05,
+      "clip_ratio/low_min": 3.5579084851633525e-06,
+      "clip_ratio/region_mean": 4.5896360120423196e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 7589.2734375,
+      "completions/mean_terminated_length": 7378.2001953125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9265239909291267,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030512227676808834,
+      "learning_rate": 1e-05,
+      "loss": 0.04,
+      "num_tokens": 228086405.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0002165911573683843,
+      "sampling/sampling_logp_difference/max": 8.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.020208362489938736,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.9613525410022703e-05,
+      "clip_ratio/high_mean": 4.903381352505676e-06,
+      "clip_ratio/low_mean": 3.184792547017423e-05,
+      "clip_ratio/low_min": 7.29296516510658e-06,
+      "clip_ratio/region_mean": 3.675130722058384e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 8420.6875,
+      "completions/mean_terminated_length": 8096.97509765625,
+      "completions/min_length": 1114.0,
+      "completions/min_terminated_length": 1114.0,
+      "entropy": 0.9572964608669281,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022430522367358208,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 229183765.0,
+      "reward": 0.34375,
+      "reward_std": 0.309583842754364,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 0.00029693738906644285,
+      "sampling/sampling_logp_difference/max": 8.121989250183105,
+      "sampling/sampling_logp_difference/mean": 0.021570362150669098,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.728750577167375e-06,
+      "clip_ratio/high_mean": 1.6821876442918438e-06,
+      "clip_ratio/low_mean": 2.1682553096979973e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.336474062758498e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15736.0,
+      "completions/mean_length": 6809.765625,
+      "completions/mean_terminated_length": 6579.984375,
+      "completions/min_length": 860.0,
+      "completions/min_terminated_length": 860.0,
+      "entropy": 0.884086549282074,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004295065999031067,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 230077607.0,
+      "reward": 0.484375,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00754612497985363,
+      "sampling/sampling_logp_difference/max": 4.886721134185791,
+      "sampling/sampling_logp_difference/mean": 0.019895706325769424,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 2.8609347509700456e-05,
+      "clip_ratio/high_mean": 7.152336877425114e-06,
+      "clip_ratio/low_mean": 5.158006410965754e-05,
+      "clip_ratio/low_min": 5.210069957684027e-06,
+      "clip_ratio/region_mean": 5.873240070286556e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15080.0,
+      "completions/mean_length": 7340.6953125,
+      "completions/mean_terminated_length": 6973.0810546875,
+      "completions/min_length": 1616.0,
+      "completions/min_terminated_length": 1616.0,
+      "entropy": 0.9920620769262314,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004631794057786465,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 231035616.0,
+      "reward": 0.4375,
+      "reward_std": 0.3235401213169098,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337792396545,
+      "sampling/importance_sampling_ratio/min": 0.0002508950710762292,
+      "sampling/sampling_logp_difference/max": 8.290475845336914,
+      "sampling/sampling_logp_difference/mean": 0.020591016858816147,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.3085940774290066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3085940774290066e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14120.0,
+      "completions/mean_length": 6748.875,
+      "completions/mean_terminated_length": 6595.93701171875,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.9867061004042625,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035752104595303535,
+      "learning_rate": 1e-05,
+      "loss": 0.0455,
+      "num_tokens": 231920056.0,
+      "reward": 0.40625,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999653100967407,
+      "sampling/importance_sampling_ratio/min": 0.0003869794018100947,
+      "sampling/sampling_logp_difference/max": 7.8571391105651855,
+      "sampling/sampling_logp_difference/mean": 0.02061416581273079,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 1.2506750408647349e-05,
+      "clip_ratio/high_mean": 3.1266876021618373e-06,
+      "clip_ratio/low_mean": 3.10397430212106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.416643085074611e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 7260.3046875,
+      "completions/mean_terminated_length": 7188.46435546875,
+      "completions/min_length": 1384.0,
+      "completions/min_terminated_length": 1384.0,
+      "entropy": 1.0388494208455086,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036644963547587395,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 232869159.0,
+      "reward": 0.390625,
+      "reward_std": 0.2359209954738617,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999546408653259,
+      "sampling/importance_sampling_ratio/min": 0.0008660226594656706,
+      "sampling/sampling_logp_difference/max": 7.051599502563477,
+      "sampling/sampling_logp_difference/mean": 0.02120530977845192,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.704355301830219e-05,
+      "clip_ratio/high_mean": 6.760888254575548e-06,
+      "clip_ratio/low_mean": 3.1861192269388994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.862208097871189e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16073.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 6354.4609375,
+      "completions/mean_terminated_length": 6354.4609375,
+      "completions/min_length": 1035.0,
+      "completions/min_terminated_length": 1035.0,
+      "entropy": 0.8405331820249557,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004709267523139715,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 233702842.0,
+      "reward": 0.546875,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 0.0046309432946145535,
+      "sampling/sampling_logp_difference/max": 5.37499475479126,
+      "sampling/sampling_logp_difference/mean": 0.019126038998365402,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 9.749228638611385e-06,
+      "clip_ratio/high_mean": 2.437307159652846e-06,
+      "clip_ratio/low_mean": 3.855073941849696e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.098804652130639e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16026.0,
+      "completions/mean_length": 6514.578125,
+      "completions/mean_terminated_length": 6357.9208984375,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "entropy": 1.0254098922014236,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003066045930609107,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 234556348.0,
+      "reward": 0.4375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805092811584,
+      "sampling/importance_sampling_ratio/min": 0.005210204049944878,
+      "sampling/sampling_logp_difference/max": 5.257136344909668,
+      "sampling/sampling_logp_difference/mean": 0.019960148259997368,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.0475813724042382e-05,
+      "clip_ratio/high_mean": 2.6189534310105955e-06,
+      "clip_ratio/low_mean": 3.487835761006863e-05,
+      "clip_ratio/low_min": 2.9392399483185727e-06,
+      "clip_ratio/region_mean": 3.749731081370555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 7379.5546875,
+      "completions/mean_terminated_length": 7236.62744140625,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 1.0397320613265038,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005132520105689764,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 235521091.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519364118576,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999256134033203,
+      "sampling/importance_sampling_ratio/min": 0.00016659013635944575,
+      "sampling/sampling_logp_difference/max": 8.699974060058594,
+      "sampling/sampling_logp_difference/mean": 0.021417103707790375,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.9904123973901733e-05,
+      "clip_ratio/high_mean": 5.776861314643611e-06,
+      "clip_ratio/low_mean": 2.6659268655748747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2436129686175263e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14565.0,
+      "completions/mean_length": 7837.1640625,
+      "completions/mean_terminated_length": 7632.04052734375,
+      "completions/min_length": 1346.0,
+      "completions/min_terminated_length": 1346.0,
+      "entropy": 0.8400963917374611,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028969801496714354,
+      "learning_rate": 1e-05,
+      "loss": 0.0143,
+      "num_tokens": 236544160.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887943267822,
+      "sampling/importance_sampling_ratio/min": 2.883308241052873e-07,
+      "sampling/sampling_logp_difference/max": 15.059157371520996,
+      "sampling/sampling_logp_difference/mean": 0.019267702475190163,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 8.562770290154731e-06,
+      "clip_ratio/high_mean": 2.1406925725386827e-06,
+      "clip_ratio/low_mean": 4.060094340729847e-05,
+      "clip_ratio/low_min": 3.8700886761944275e-06,
+      "clip_ratio/region_mean": 4.2741635979837156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15350.0,
+      "completions/mean_length": 6696.3515625,
+      "completions/mean_terminated_length": 6542.57958984375,
+      "completions/min_length": 1239.0,
+      "completions/min_terminated_length": 1239.0,
+      "entropy": 0.8495818004012108,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003412836929783225,
+      "learning_rate": 1e-05,
+      "loss": 0.0803,
+      "num_tokens": 237423101.0,
+      "reward": 0.515625,
+      "reward_std": 0.37981897592544556,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.012152798473834991,
+      "sampling/sampling_logp_difference/max": 4.410195827484131,
+      "sampling/sampling_logp_difference/mean": 0.018458625301718712,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 1.1463653436294408e-05,
+      "clip_ratio/high_mean": 3.646129641765583e-06,
+      "clip_ratio/low_mean": 6.144847083078275e-05,
+      "clip_ratio/low_min": 1.110105540647055e-05,
+      "clip_ratio/region_mean": 6.509460160941671e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15666.0,
+      "completions/mean_length": 7700.3671875,
+      "completions/mean_terminated_length": 7121.45849609375,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "entropy": 0.8258870914578438,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024443145375698805,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 238429956.0,
+      "reward": 0.375,
+      "reward_std": 0.2872493863105774,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999113082885742,
+      "sampling/importance_sampling_ratio/min": 0.00026112530031241477,
+      "sampling/sampling_logp_difference/max": 8.250510215759277,
+      "sampling/sampling_logp_difference/mean": 0.019427984952926636,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 4.218127742205979e-06,
+      "clip_ratio/high_mean": 1.0545319355514948e-06,
+      "clip_ratio/low_mean": 1.7289162997258245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.834369493280974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16112.0,
+      "completions/mean_length": 6255.21875,
+      "completions/mean_terminated_length": 6094.44482421875,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.8179014846682549,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022747826296836138,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 239250160.0,
+      "reward": 0.5234375,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.0002633975527714938,
+      "sampling/sampling_logp_difference/max": 8.241846084594727,
+      "sampling/sampling_logp_difference/mean": 0.018723051995038986,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 1.698448841125355e-05,
+      "clip_ratio/high_mean": 5.369374321162468e-06,
+      "clip_ratio/low_mean": 6.14647315160255e-05,
+      "clip_ratio/low_min": 5.043576493335422e-06,
+      "clip_ratio/region_mean": 6.683410583718796e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15321.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6914.9609375,
+      "completions/mean_terminated_length": 6914.9609375,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9700981751084328,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005685295443981886,
+      "learning_rate": 1e-05,
+      "loss": -0.0056,
+      "num_tokens": 240156211.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998887777328491,
+      "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05,
+      "sampling/sampling_logp_difference/max": 9.997581481933594,
+      "sampling/sampling_logp_difference/mean": 0.021195171400904655,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9186837764427764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9186837764427764e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15469.0,
+      "completions/mean_length": 5227.53125,
+      "completions/mean_terminated_length": 5139.68505859375,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.9116031974554062,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003880272386595607,
+      "learning_rate": 1e-05,
+      "loss": 0.1246,
+      "num_tokens": 240845295.0,
+      "reward": 0.6328125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000362396240234,
+      "sampling/importance_sampling_ratio/min": 0.00012422871077433228,
+      "sampling/sampling_logp_difference/max": 8.993386268615723,
+      "sampling/sampling_logp_difference/mean": 0.018801718950271606,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 2.5015486926349695e-05,
+      "clip_ratio/high_mean": 8.084949570275057e-06,
+      "clip_ratio/low_mean": 5.524710468307603e-05,
+      "clip_ratio/low_min": 3.776891389861703e-06,
+      "clip_ratio/region_mean": 6.333205465125502e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 8065.4765625,
+      "completions/mean_terminated_length": 7510.90869140625,
+      "completions/min_length": 1055.0,
+      "completions/min_terminated_length": 1055.0,
+      "entropy": 0.7446574792265892,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028986844699829817,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 241895676.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3474721610546112,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999842643737793,
+      "sampling/importance_sampling_ratio/min": 0.0017039099475368857,
+      "sampling/sampling_logp_difference/max": 6.3748297691345215,
+      "sampling/sampling_logp_difference/mean": 0.01853121444582939,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 9.486341014053323e-06,
+      "clip_ratio/high_mean": 2.371585253513331e-06,
+      "clip_ratio/low_mean": 2.896106741445692e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.133265261112683e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15534.0,
+      "completions/max_terminated_length": 15534.0,
+      "completions/mean_length": 6127.359375,
+      "completions/mean_terminated_length": 6127.359375,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.8569132760167122,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003845847910270095,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 242698258.0,
+      "reward": 0.53125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000942945480347,
+      "sampling/importance_sampling_ratio/min": 0.00043231461313553154,
+      "sampling/sampling_logp_difference/max": 7.746356964111328,
+      "sampling/sampling_logp_difference/mean": 0.01856958493590355,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 2.9848330086679198e-05,
+      "clip_ratio/high_mean": 7.4620825216697995e-06,
+      "clip_ratio/low_mean": 4.3558867673709756e-05,
+      "clip_ratio/low_min": 4.417741820361698e-06,
+      "clip_ratio/region_mean": 5.1020949285884853e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15192.0,
+      "completions/mean_length": 6600.1484375,
+      "completions/mean_terminated_length": 6365.33642578125,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.78924310952425,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003953634761273861,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 243560957.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.0006525487406179309,
+      "sampling/sampling_logp_difference/max": 7.334624767303467,
+      "sampling/sampling_logp_difference/mean": 0.018097909167408943,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 6.635561703660642e-06,
+      "clip_ratio/high_mean": 1.6588904259151604e-06,
+      "clip_ratio/low_mean": 2.737523408313791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9034124281679397e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7852.171875,
+      "completions/mean_terminated_length": 7852.171875,
+      "completions/min_length": 1276.0,
+      "completions/min_terminated_length": 1276.0,
+      "entropy": 1.0598893761634827,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00360781978815794,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 244585923.0,
+      "reward": 0.3125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05,
+      "sampling/sampling_logp_difference/max": 10.076086044311523,
+      "sampling/sampling_logp_difference/mean": 0.022330068051815033,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 3.1540168947685743e-06,
+      "clip_ratio/high_mean": 7.885042236921436e-07,
+      "clip_ratio/low_mean": 4.7973388973332476e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.876189268543385e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7972.2265625,
+      "completions/mean_terminated_length": 7700.87890625,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.933217465877533,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0027661293279379606,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 245628064.0,
+      "reward": 0.28125,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05,
+      "sampling/sampling_logp_difference/max": 10.366576194763184,
+      "sampling/sampling_logp_difference/mean": 0.021125148981809616,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.2965969062861404e-05,
+      "clip_ratio/high_mean": 3.241492265715351e-06,
+      "clip_ratio/low_mean": 4.6317693090713874e-05,
+      "clip_ratio/low_min": 3.820877282123547e-06,
+      "clip_ratio/region_mean": 4.955918507221213e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15744.0,
+      "completions/mean_length": 7135.6953125,
+      "completions/mean_terminated_length": 6913.736328125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 0.7786942347884178,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005680318456143141,
+      "learning_rate": 1e-05,
+      "loss": 0.0786,
+      "num_tokens": 246561329.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999462366104126,
+      "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05,
+      "sampling/sampling_logp_difference/max": 9.737424850463867,
+      "sampling/sampling_logp_difference/mean": 0.018504241481423378,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.22437145175536e-05,
+      "clip_ratio/low_min": 1.4025082009538892e-05,
+      "clip_ratio/region_mean": 4.22437145175536e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 6704.046875,
+      "completions/mean_terminated_length": 6627.82666015625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "entropy": 1.0435140281915665,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026402862276881933,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 247437415.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31276631355285645,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998904466629028,
+      "sampling/importance_sampling_ratio/min": 0.0007800163584761322,
+      "sampling/sampling_logp_difference/max": 7.156195640563965,
+      "sampling/sampling_logp_difference/mean": 0.02134273201227188,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 2.223430897174694e-05,
+      "clip_ratio/high_mean": 6.8746438159905665e-06,
+      "clip_ratio/low_mean": 4.7084630978133646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3959275192028144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 5892.5078125,
+      "completions/mean_terminated_length": 5725.9765625,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "entropy": 0.8004944771528244,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003993614576756954,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 248211112.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0024652592837810516,
+      "sampling/sampling_logp_difference/max": 6.005458354949951,
+      "sampling/sampling_logp_difference/mean": 0.01924925297498703,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 2.1833082200828358e-05,
+      "clip_ratio/high_mean": 5.458270550207089e-06,
+      "clip_ratio/low_mean": 3.415995615796419e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961822596920683e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 7812.140625,
+      "completions/mean_terminated_length": 7316.24755859375,
+      "completions/min_length": 1515.0,
+      "completions/min_terminated_length": 1515.0,
+      "entropy": 0.8841542899608612,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001573400106281042,
+      "learning_rate": 1e-05,
+      "loss": 0.0823,
+      "num_tokens": 249228106.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 0.001001527882181108,
+      "sampling/sampling_logp_difference/max": 6.906228542327881,
+      "sampling/sampling_logp_difference/mean": 0.01956877112388611,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 1.014439021673752e-05,
+      "clip_ratio/high_mean": 2.53609755418438e-06,
+      "clip_ratio/low_mean": 3.068193461785995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.321803217204433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 6372.953125,
+      "completions/mean_terminated_length": 6132.6884765625,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.8228401988744736,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021125099156051874,
+      "learning_rate": 1e-05,
+      "loss": 0.0438,
+      "num_tokens": 250063284.0,
+      "reward": 0.5,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05,
+      "sampling/sampling_logp_difference/max": 9.937475204467773,
+      "sampling/sampling_logp_difference/mean": 0.01943521574139595,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 7.023906164249638e-06,
+      "clip_ratio/high_mean": 1.7559765410624095e-06,
+      "clip_ratio/low_mean": 2.526416994896863e-05,
+      "clip_ratio/low_min": 6.7760895490209805e-06,
+      "clip_ratio/region_mean": 2.7020146660561295e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16270.0,
+      "completions/mean_length": 7817.8671875,
+      "completions/mean_terminated_length": 7396.58154296875,
+      "completions/min_length": 1568.0,
+      "completions/min_terminated_length": 1568.0,
+      "entropy": 0.9454319775104523,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022315154783427715,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 251085123.0,
+      "reward": 0.40625,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06,
+      "sampling/sampling_logp_difference/max": 12.760490417480469,
+      "sampling/sampling_logp_difference/mean": 0.021764669567346573,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 1.4797966287005693e-05,
+      "clip_ratio/high_mean": 3.699491571751423e-06,
+      "clip_ratio/low_mean": 4.36271948274225e-05,
+      "clip_ratio/low_min": 3.6957101201551268e-06,
+      "clip_ratio/region_mean": 4.732668639917392e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 7168.4921875,
+      "completions/mean_terminated_length": 6635.36328125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8433891162276268,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 252020906.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589920043945,
+      "sampling/importance_sampling_ratio/min": 0.0003851866349577904,
+      "sampling/sampling_logp_difference/max": 7.861782550811768,
+      "sampling/sampling_logp_difference/mean": 0.01929781585931778,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 1.996871560550062e-05,
+      "clip_ratio/high_mean": 6.089093403716106e-06,
+      "clip_ratio/low_mean": 4.2792244585143635e-05,
+      "clip_ratio/low_min": 1.0337215371691855e-05,
+      "clip_ratio/region_mean": 4.8881338216233416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7322.5078125,
+      "completions/mean_terminated_length": 6876.8603515625,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 0.9157031401991844,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036942458245903254,
+      "learning_rate": 1e-05,
+      "loss": 0.079,
+      "num_tokens": 252977435.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24275577068328857,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999804496765137,
+      "sampling/importance_sampling_ratio/min": 0.00029605376766994596,
+      "sampling/sampling_logp_difference/max": 8.124969482421875,
+      "sampling/sampling_logp_difference/mean": 0.0205365102738142,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.631919460327481e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.631919460327481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16078.0,
+      "completions/mean_length": 7025.484375,
+      "completions/mean_terminated_length": 6723.5966796875,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 1.1329731941223145,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034127074759453535,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 253896161.0,
+      "reward": 0.25,
+      "reward_std": 0.27722424268722534,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0005197672289796174,
+      "sampling/sampling_logp_difference/max": 7.562129497528076,
+      "sampling/sampling_logp_difference/mean": 0.023741140961647034,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 4.368643658381188e-06,
+      "clip_ratio/high_mean": 1.092160914595297e-06,
+      "clip_ratio/low_mean": 2.4661783299961826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5753944555617636e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13776.0,
+      "completions/mean_length": 5996.1796875,
+      "completions/mean_terminated_length": 5661.08837890625,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8773328885436058,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003959407564252615,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 254690264.0,
+      "reward": 0.53125,
+      "reward_std": 0.26645541191101074,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07,
+      "sampling/sampling_logp_difference/max": 15.73043155670166,
+      "sampling/sampling_logp_difference/mean": 0.018407585099339485,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.616483677935321e-05,
+      "clip_ratio/high_mean": 4.041209194838302e-06,
+      "clip_ratio/low_mean": 3.736187466074625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140308453770558e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16383.0,
+      "completions/mean_length": 7165.328125,
+      "completions/mean_terminated_length": 6867.951171875,
+      "completions/min_length": 1115.0,
+      "completions/min_terminated_length": 1115.0,
+      "entropy": 0.9502597972750664,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030910037457942963,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 255626394.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000731945037842,
+      "sampling/importance_sampling_ratio/min": 0.00022311302018351853,
+      "sampling/sampling_logp_difference/max": 8.407832145690918,
+      "sampling/sampling_logp_difference/mean": 0.020668907091021538,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.1702686606440693e-05,
+      "clip_ratio/high_mean": 2.9256716516101733e-06,
+      "clip_ratio/low_mean": 5.5247357522603124e-05,
+      "clip_ratio/low_min": 3.6811261452385224e-06,
+      "clip_ratio/region_mean": 5.8173028264718596e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15375.0,
+      "completions/mean_length": 8001.9296875,
+      "completions/mean_terminated_length": 7661.34912109375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "entropy": 0.8591345250606537,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037233952898532152,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 256673457.0,
+      "reward": 0.421875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999151229858398,
+      "sampling/importance_sampling_ratio/min": 0.0021876997780054808,
+      "sampling/sampling_logp_difference/max": 6.124904632568359,
+      "sampling/sampling_logp_difference/mean": 0.020540472120046616,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 3.721341136042611e-05,
+      "clip_ratio/high_mean": 1.2759249216287571e-05,
+      "clip_ratio/low_mean": 3.570647322703735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.846572301175911e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 6924.84375,
+      "completions/mean_terminated_length": 6697.82421875,
+      "completions/min_length": 803.0,
+      "completions/min_terminated_length": 803.0,
+      "entropy": 0.7969356626272202,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006054217461496592,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 257578501.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.007889713160693645,
+      "sampling/sampling_logp_difference/max": 4.842195510864258,
+      "sampling/sampling_logp_difference/mean": 0.019306108355522156,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.0211543894911301e-05,
+      "clip_ratio/high_mean": 2.5528859737278253e-06,
+      "clip_ratio/low_mean": 5.2388056587915344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4940942732173426e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14439.0,
+      "completions/mean_length": 6203.03125,
+      "completions/mean_terminated_length": 5958.6884765625,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "entropy": 0.8734413683414459,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004903806839138269,
+      "learning_rate": 1e-05,
+      "loss": 0.0689,
+      "num_tokens": 258392625.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999826550483704,
+      "sampling/importance_sampling_ratio/min": 0.00020370795391499996,
+      "sampling/sampling_logp_difference/max": 8.498823165893555,
+      "sampling/sampling_logp_difference/mean": 0.01909301057457924,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.5135058674786706e-05,
+      "clip_ratio/high_mean": 4.64845766146027e-06,
+      "clip_ratio/low_mean": 4.373456977191381e-05,
+      "clip_ratio/low_min": 3.670856358439778e-06,
+      "clip_ratio/region_mean": 4.8383026296505705e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 7982.5390625,
+      "completions/mean_terminated_length": 7641.01611328125,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0091779381036758,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033637424930930138,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 259435270.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999765753746033,
+      "sampling/importance_sampling_ratio/min": 0.0016514655435457826,
+      "sampling/sampling_logp_difference/max": 6.406092166900635,
+      "sampling/sampling_logp_difference/mean": 0.02182736061513424,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 2.3964702677403693e-05,
+      "clip_ratio/high_mean": 5.991175669350923e-06,
+      "clip_ratio/low_mean": 5.2442986770984135e-05,
+      "clip_ratio/low_min": 8.75736759553547e-06,
+      "clip_ratio/region_mean": 5.843416238349164e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6915.3125,
+      "completions/mean_terminated_length": 6688.064453125,
+      "completions/min_length": 778.0,
+      "completions/min_terminated_length": 778.0,
+      "entropy": 0.7964543774724007,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052203768864274025,
+      "learning_rate": 1e-05,
+      "loss": 0.144,
+      "num_tokens": 260337614.0,
+      "reward": 0.46875,
+      "reward_std": 0.37928223609924316,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 7.032832218101248e-05,
+      "sampling/sampling_logp_difference/max": 9.562335968017578,
+      "sampling/sampling_logp_difference/mean": 0.017896221950650215,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 4.458271632756805e-05,
+      "clip_ratio/high_mean": 1.1145679081892013e-05,
+      "clip_ratio/low_mean": 6.243192206056847e-05,
+      "clip_ratio/low_min": 1.2397775662975619e-05,
+      "clip_ratio/region_mean": 7.357759886872373e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7029.4375,
+      "completions/mean_terminated_length": 6880.95263671875,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "entropy": 0.8605096861720085,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005570738110691309,
+      "learning_rate": 1e-05,
+      "loss": 0.0984,
+      "num_tokens": 261254070.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3327290117740631,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999494552612305,
+      "sampling/importance_sampling_ratio/min": 0.0009070249507203698,
+      "sampling/sampling_logp_difference/max": 7.005340576171875,
+      "sampling/sampling_logp_difference/mean": 0.01905740052461624,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 3.390461233720998e-05,
+      "clip_ratio/high_mean": 1.1191766247975465e-05,
+      "clip_ratio/low_mean": 7.46641262594494e-05,
+      "clip_ratio/low_min": 5.041745680500753e-06,
+      "clip_ratio/region_mean": 8.585589102949598e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5858.84375,
+      "completions/mean_terminated_length": 5606.240234375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.8430554121732712,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004496110137552023,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 262024906.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294877052307,
+      "sampling/importance_sampling_ratio/min": 0.00040469475788995624,
+      "sampling/sampling_logp_difference/max": 7.812377452850342,
+      "sampling/sampling_logp_difference/mean": 0.019225869327783585,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 3.2563955301156966e-06,
+      "clip_ratio/high_mean": 8.140988825289242e-07,
+      "clip_ratio/low_mean": 3.7080020149460324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.789411886145899e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15976.0,
+      "completions/mean_length": 8337.328125,
+      "completions/mean_terminated_length": 7728.7568359375,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.901745393872261,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00348713924176991,
+      "learning_rate": 1e-05,
+      "loss": -0.0002,
+      "num_tokens": 263110844.0,
+      "reward": 0.296875,
+      "reward_std": 0.20805485546588898,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998900890350342,
+      "sampling/importance_sampling_ratio/min": 0.0022652465850114822,
+      "sampling/sampling_logp_difference/max": 6.090071678161621,
+      "sampling/sampling_logp_difference/mean": 0.02157524600625038,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 2.3739744847262045e-05,
+      "clip_ratio/high_mean": 5.934936211815511e-06,
+      "clip_ratio/low_mean": 2.823553325015382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.417046866616147e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 7084.7265625,
+      "completions/mean_terminated_length": 6381.42041015625,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8265534415841103,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003980033565312624,
+      "learning_rate": 1e-05,
+      "loss": 0.0551,
+      "num_tokens": 264036169.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673366546631,
+      "sampling/importance_sampling_ratio/min": 0.00012345099821686745,
+      "sampling/sampling_logp_difference/max": 8.999666213989258,
+      "sampling/sampling_logp_difference/mean": 0.018782664090394974,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 1.1745505617000163e-05,
+      "clip_ratio/high_mean": 3.771558226617344e-06,
+      "clip_ratio/low_mean": 6.913120819262986e-05,
+      "clip_ratio/low_min": 2.494283216947224e-05,
+      "clip_ratio/region_mean": 7.290276607818669e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6543.796875,
+      "completions/mean_terminated_length": 6543.796875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8899869695305824,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.006467343773692846,
+      "learning_rate": 1e-05,
+      "loss": 0.1139,
+      "num_tokens": 264892767.0,
+      "reward": 0.484375,
+      "reward_std": 0.3934885561466217,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000489950180054,
+      "sampling/importance_sampling_ratio/min": 9.891482477542013e-05,
+      "sampling/sampling_logp_difference/max": 9.221251487731934,
+      "sampling/sampling_logp_difference/mean": 0.02032080665230751,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.395576979732141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.395576979732141e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16307.0,
+      "completions/mean_length": 8483.390625,
+      "completions/mean_terminated_length": 7813.84765625,
+      "completions/min_length": 1342.0,
+      "completions/min_terminated_length": 1342.0,
+      "entropy": 0.9621479511260986,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003174177836626768,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 265995697.0,
+      "reward": 0.3359375,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.0005628522485494614,
+      "sampling/sampling_logp_difference/max": 7.4824934005737305,
+      "sampling/sampling_logp_difference/mean": 0.02145479805767536,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.2596524811669951e-05,
+      "clip_ratio/high_mean": 3.149131202917488e-06,
+      "clip_ratio/low_mean": 3.7911659774181317e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.106079018129094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14985.0,
+      "completions/mean_length": 7184.578125,
+      "completions/mean_terminated_length": 6963.79248046875,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.9993807673454285,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003356153378263116,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 266937707.0,
+      "reward": 0.3828125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000238418579102,
+      "sampling/importance_sampling_ratio/min": 0.0017036627978086472,
+      "sampling/sampling_logp_difference/max": 6.374974727630615,
+      "sampling/sampling_logp_difference/mean": 0.02204768732190132,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 1.9245163684900035e-05,
+      "clip_ratio/high_mean": 4.811290921225009e-06,
+      "clip_ratio/low_mean": 4.8845648166206956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.365693925796222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16216.0,
+      "completions/mean_length": 7029.2265625,
+      "completions/mean_terminated_length": 6727.45947265625,
+      "completions/min_length": 851.0,
+      "completions/min_terminated_length": 851.0,
+      "entropy": 0.9139953926205635,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006375293247401714,
+      "learning_rate": 1e-05,
+      "loss": 0.0519,
+      "num_tokens": 267853880.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.010649868287146091,
+      "sampling/sampling_logp_difference/max": 4.542207717895508,
+      "sampling/sampling_logp_difference/mean": 0.020365029573440552,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 4.812504812434781e-06,
+      "clip_ratio/high_mean": 1.2031262031086953e-06,
+      "clip_ratio/low_mean": 2.5999243803198624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.720237000630732e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6188.0078125,
+      "completions/mean_terminated_length": 5943.30419921875,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.7640773430466652,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003697809297591448,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 268665721.0,
+      "reward": 0.5078125,
+      "reward_std": 0.20699402689933777,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372363090515,
+      "sampling/importance_sampling_ratio/min": 0.02927250787615776,
+      "sampling/sampling_logp_difference/max": 3.531106472015381,
+      "sampling/sampling_logp_difference/mean": 0.016581017524003983,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1358927824621787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1358927824621787e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 8128.21875,
+      "completions/mean_terminated_length": 7861.90283203125,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.8218234181404114,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002286596456542611,
+      "learning_rate": 1e-05,
+      "loss": 0.0763,
+      "num_tokens": 269726181.0,
+      "reward": 0.375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999798536300659,
+      "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06,
+      "sampling/sampling_logp_difference/max": 12.90043830871582,
+      "sampling/sampling_logp_difference/mean": 0.019403984770178795,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 1.4808477317274082e-05,
+      "clip_ratio/high_mean": 3.7021193293185206e-06,
+      "clip_ratio/low_mean": 3.0363167581981543e-05,
+      "clip_ratio/low_min": 6.364238288369961e-06,
+      "clip_ratio/region_mean": 3.4065286854456645e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 5673.3359375,
+      "completions/mean_terminated_length": 5503.32568359375,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.9275510385632515,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00485506234690547,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 270470616.0,
+      "reward": 0.4921875,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.0009123464697040617,
+      "sampling/sampling_logp_difference/max": 6.999490737915039,
+      "sampling/sampling_logp_difference/mean": 0.01881871558725834,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 1.1274602456978755e-05,
+      "clip_ratio/high_mean": 3.6739949109687586e-06,
+      "clip_ratio/low_mean": 3.968570712231667e-05,
+      "clip_ratio/low_min": 3.4213767321489286e-06,
+      "clip_ratio/region_mean": 4.335970191959859e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 6944.8984375,
+      "completions/mean_terminated_length": 6795.07177734375,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9335741624236107,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005874342750757933,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 271377723.0,
+      "reward": 0.390625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000594854354858,
+      "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05,
+      "sampling/sampling_logp_difference/max": 10.049861907958984,
+      "sampling/sampling_logp_difference/mean": 0.020590776577591896,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 1.264126694877632e-05,
+      "clip_ratio/high_mean": 3.16031673719408e-06,
+      "clip_ratio/low_mean": 3.206376845810155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.522408474054828e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 7705.625,
+      "completions/mean_terminated_length": 7278.8193359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.8491624072194099,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001684082904830575,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 272384891.0,
+      "reward": 0.390625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 6.605865200981498e-05,
+      "sampling/sampling_logp_difference/max": 9.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.020136822015047073,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 9.772357770998497e-06,
+      "clip_ratio/high_mean": 2.443089442749624e-06,
+      "clip_ratio/low_mean": 3.8573590472879005e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.101667946088128e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6611.1484375,
+      "completions/mean_terminated_length": 6534.19677734375,
+      "completions/min_length": 1116.0,
+      "completions/min_terminated_length": 1116.0,
+      "entropy": 0.8867302760481834,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003692191792652011,
+      "learning_rate": 1e-05,
+      "loss": 0.1233,
+      "num_tokens": 273251630.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999606609344482,
+      "sampling/importance_sampling_ratio/min": 0.0031062732450664043,
+      "sampling/sampling_logp_difference/max": 5.774331569671631,
+      "sampling/sampling_logp_difference/mean": 0.019237037748098373,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 3.0103737344688852e-05,
+      "clip_ratio/high_mean": 9.664363972206047e-06,
+      "clip_ratio/low_mean": 1.7575501146893657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.723986426644842e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15786.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 6770.46875,
+      "completions/mean_terminated_length": 6770.46875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.8252957463264465,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004167635925114155,
+      "learning_rate": 1e-05,
+      "loss": -0.0072,
+      "num_tokens": 274146482.0,
+      "reward": 0.5703125,
+      "reward_std": 0.23486016690731049,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.00010247006866848096,
+      "sampling/sampling_logp_difference/max": 9.18593978881836,
+      "sampling/sampling_logp_difference/mean": 0.019684650003910065,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 6.529460733872838e-06,
+      "clip_ratio/high_mean": 1.6323651834682096e-06,
+      "clip_ratio/low_mean": 3.877351048231503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.040587566578324e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15827.0,
+      "completions/mean_length": 8210.859375,
+      "completions/mean_terminated_length": 7365.36181640625,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.8118235394358635,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030363225378096104,
+      "learning_rate": 1e-05,
+      "loss": 0.0531,
+      "num_tokens": 275214040.0,
+      "reward": 0.3515625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998943209648132,
+      "sampling/importance_sampling_ratio/min": 0.002854935359209776,
+      "sampling/sampling_logp_difference/max": 5.858705997467041,
+      "sampling/sampling_logp_difference/mean": 0.019275270402431488,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.0800629146106075e-06,
+      "clip_ratio/high_mean": 1.7700157286526519e-06,
+      "clip_ratio/low_mean": 2.3981688286767167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5751703674359305e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14900.0,
+      "completions/mean_length": 7072.8828125,
+      "completions/mean_terminated_length": 6849.41650390625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8018335327506065,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004777858033776283,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 276138049.0,
+      "reward": 0.453125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368190765381,
+      "sampling/importance_sampling_ratio/min": 0.0028502768836915493,
+      "sampling/sampling_logp_difference/max": 5.860339164733887,
+      "sampling/sampling_logp_difference/mean": 0.01849908009171486,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 2.259368602608447e-05,
+      "clip_ratio/high_mean": 5.648421506521117e-06,
+      "clip_ratio/low_mean": 4.28424866640853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.849090737479855e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14447.0,
+      "completions/mean_length": 5889.8359375,
+      "completions/mean_terminated_length": 5723.26220703125,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.7976400703191757,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030593445990234613,
+      "learning_rate": 1e-05,
+      "loss": 0.1331,
+      "num_tokens": 276910124.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999091029167175,
+      "sampling/importance_sampling_ratio/min": 0.000139843366923742,
+      "sampling/sampling_logp_difference/max": 8.874987602233887,
+      "sampling/sampling_logp_difference/mean": 0.01834402233362198,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 1.4654247024736833e-05,
+      "clip_ratio/high_mean": 3.663561756184208e-06,
+      "clip_ratio/low_mean": 2.377464920755301e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7438210736363544e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 7144.265625,
+      "completions/mean_terminated_length": 6689.85205078125,
+      "completions/min_length": 1200.0,
+      "completions/min_terminated_length": 1200.0,
+      "entropy": 0.8309404999017715,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004245694726705551,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 277843542.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998534321784973,
+      "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05,
+      "sampling/sampling_logp_difference/max": 11.499897956848145,
+      "sampling/sampling_logp_difference/mean": 0.01875344291329384,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 6.252500952541595e-06,
+      "clip_ratio/high_mean": 2.241558604509919e-06,
+      "clip_ratio/low_mean": 4.735765514851664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9599213525652885e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15722.0,
+      "completions/mean_length": 6779.5234375,
+      "completions/mean_terminated_length": 6703.8974609375,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.9584890529513359,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035574575886130333,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 278730129.0,
+      "reward": 0.3984375,
+      "reward_std": 0.32825323939323425,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.005792221520096064,
+      "sampling/sampling_logp_difference/max": 5.151239395141602,
+      "sampling/sampling_logp_difference/mean": 0.02137477695941925,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 3.2948471016425174e-05,
+      "clip_ratio/high_mean": 9.518853403278627e-06,
+      "clip_ratio/low_mean": 2.195712454522436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.14759782895635e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15892.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 5582.9765625,
+      "completions/mean_terminated_length": 5582.9765625,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8629376217722893,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037982752546668053,
+      "learning_rate": 1e-05,
+      "loss": 0.0331,
+      "num_tokens": 279462542.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3164186477661133,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999780058860779,
+      "sampling/importance_sampling_ratio/min": 0.0021874974481761456,
+      "sampling/sampling_logp_difference/max": 6.124997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01906203106045723,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.1029473625967512e-05,
+      "clip_ratio/high_mean": 2.757368406491878e-06,
+      "clip_ratio/low_mean": 5.367386921761863e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6431237737797346e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 6942.2578125,
+      "completions/mean_terminated_length": 6477.90966796875,
+      "completions/min_length": 1156.0,
+      "completions/min_terminated_length": 1156.0,
+      "entropy": 0.8147861957550049,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027678858023136854,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 280370207.0,
+      "reward": 0.4375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998471736907959,
+      "sampling/importance_sampling_ratio/min": 0.00023058800434228033,
+      "sampling/sampling_logp_difference/max": 8.3748779296875,
+      "sampling/sampling_logp_difference/mean": 0.01940828748047352,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 2.6367894406575942e-05,
+      "clip_ratio/high_mean": 8.765707434577052e-06,
+      "clip_ratio/low_mean": 3.232976985145797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.109547796815605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6242.53125,
+      "completions/mean_terminated_length": 5915.38671875,
+      "completions/min_length": 1220.0,
+      "completions/min_terminated_length": 1220.0,
+      "entropy": 0.878915011882782,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00577945914119482,
+      "learning_rate": 1e-05,
+      "loss": 0.0839,
+      "num_tokens": 281189491.0,
+      "reward": 0.515625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 9.611724817659706e-05,
+      "sampling/sampling_logp_difference/max": 9.2499418258667,
+      "sampling/sampling_logp_difference/mean": 0.01948760263621807,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 3.50839609382092e-05,
+      "clip_ratio/high_mean": 1.1664920634757436e-05,
+      "clip_ratio/low_mean": 1.833109013205103e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9996010880495305e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 7004.015625,
+      "completions/mean_terminated_length": 6622.71533203125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "entropy": 0.7964659407734871,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014128695474937558,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 282103997.0,
+      "reward": 0.4140625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.0024504722096025944,
+      "sampling/sampling_logp_difference/max": 6.011474609375,
+      "sampling/sampling_logp_difference/mean": 0.019019678235054016,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.832260545597819e-05,
+      "clip_ratio/high_mean": 4.580651363994548e-06,
+      "clip_ratio/low_mean": 5.309064226821647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.767129368905444e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7822.6953125,
+      "completions/mean_terminated_length": 7546.52392578125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.8571138679981232,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002476039342582226,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 283122382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999314546585083,
+      "sampling/importance_sampling_ratio/min": 0.0009774373611435294,
+      "sampling/sampling_logp_difference/max": 6.930576324462891,
+      "sampling/sampling_logp_difference/mean": 0.020557202398777008,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 5.738419986300869e-06,
+      "clip_ratio/high_mean": 1.4346049965752172e-06,
+      "clip_ratio/low_mean": 4.19679121819172e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3402517292179255e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7738.8984375,
+      "completions/mean_terminated_length": 6844.57763671875,
+      "completions/min_length": 897.0,
+      "completions/min_terminated_length": 897.0,
+      "entropy": 0.7839021533727646,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005309853237122297,
+      "learning_rate": 1e-05,
+      "loss": 0.043,
+      "num_tokens": 284130081.0,
+      "reward": 0.5234375,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998971223831177,
+      "sampling/importance_sampling_ratio/min": 0.0001319014554610476,
+      "sampling/sampling_logp_difference/max": 8.933455467224121,
+      "sampling/sampling_logp_difference/mean": 0.01873316988348961,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 1.007085802484653e-05,
+      "clip_ratio/high_mean": 2.5177145062116324e-06,
+      "clip_ratio/low_mean": 4.043528815600439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.295300277590286e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15952.0,
+      "completions/mean_length": 7102.2421875,
+      "completions/mean_terminated_length": 6954.9130859375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.8530801385641098,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004228116944432259,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 285058720.0,
+      "reward": 0.5078125,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00012956927821505815,
+      "sampling/sampling_logp_difference/max": 8.951294898986816,
+      "sampling/sampling_logp_difference/mean": 0.019325006753206253,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 4.06874551117653e-06,
+      "clip_ratio/high_mean": 1.0171863777941326e-06,
+      "clip_ratio/low_mean": 3.661125703047219e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.762844340826632e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15594.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6583.4765625,
+      "completions/mean_terminated_length": 6583.4765625,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 1.021921381354332,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004967439454048872,
+      "learning_rate": 1e-05,
+      "loss": 0.0374,
+      "num_tokens": 285919765.0,
+      "reward": 0.328125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00004243850708,
+      "sampling/importance_sampling_ratio/min": 0.016675354912877083,
+      "sampling/sampling_logp_difference/max": 4.093823432922363,
+      "sampling/sampling_logp_difference/mean": 0.021393200382590294,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.2215251445013564e-05,
+      "clip_ratio/high_mean": 3.053812861253391e-06,
+      "clip_ratio/low_mean": 4.05305947879242e-05,
+      "clip_ratio/low_min": 4.215567059873138e-06,
+      "clip_ratio/region_mean": 4.358440742180392e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16299.0,
+      "completions/mean_length": 7770.5859375,
+      "completions/mean_terminated_length": 7346.97509765625,
+      "completions/min_length": 1040.0,
+      "completions/min_terminated_length": 1040.0,
+      "entropy": 1.0466903448104858,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004189736675471067,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 286935512.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2369818240404129,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797344207764,
+      "sampling/importance_sampling_ratio/min": 0.011683559976518154,
+      "sampling/sampling_logp_difference/max": 4.449572563171387,
+      "sampling/sampling_logp_difference/mean": 0.021805983036756516,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 2.0567378214764176e-05,
+      "clip_ratio/high_mean": 5.141844553691044e-06,
+      "clip_ratio/low_mean": 1.8177100628236076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3318944840866607e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15758.0,
+      "completions/mean_length": 5689.2421875,
+      "completions/mean_terminated_length": 5432.568359375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.7778806164860725,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0032866497058421373,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 287681943.0,
+      "reward": 0.640625,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940812587738,
+      "sampling/importance_sampling_ratio/min": 0.00038077132194302976,
+      "sampling/sampling_logp_difference/max": 7.873311519622803,
+      "sampling/sampling_logp_difference/mean": 0.01789461076259613,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 3.109086901531555e-05,
+      "clip_ratio/high_mean": 7.772717253828887e-06,
+      "clip_ratio/low_mean": 3.1423560130861006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.919627738468989e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13820.0,
+      "completions/mean_length": 6288.1875,
+      "completions/mean_terminated_length": 6127.93701171875,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.7709921672940254,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023572889622300863,
+      "learning_rate": 1e-05,
+      "loss": 0.0746,
+      "num_tokens": 288506735.0,
+      "reward": 0.484375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474287033081,
+      "sampling/importance_sampling_ratio/min": 0.000430915504693985,
+      "sampling/sampling_logp_difference/max": 7.749598503112793,
+      "sampling/sampling_logp_difference/mean": 0.017407266423106194,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 3.4638953366084024e-05,
+      "clip_ratio/high_mean": 9.51674803673086e-06,
+      "clip_ratio/low_mean": 6.26047980176736e-05,
+      "clip_ratio/low_min": 5.51267930859467e-06,
+      "clip_ratio/region_mean": 7.212154741864651e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 6775.0234375,
+      "completions/mean_terminated_length": 6465.05615234375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9338318258523941,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034220058005303144,
+      "learning_rate": 1e-05,
+      "loss": 0.0986,
+      "num_tokens": 289395498.0,
+      "reward": 0.390625,
+      "reward_std": 0.34533774852752686,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603033065796,
+      "sampling/importance_sampling_ratio/min": 0.0317598432302475,
+      "sampling/sampling_logp_difference/max": 3.449552536010742,
+      "sampling/sampling_logp_difference/mean": 0.019930530339479446,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 7.159989991123439e-05,
+      "clip_ratio/low_min": 1.5592839645250933e-05,
+      "clip_ratio/region_mean": 7.159989991123439e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 7142.9375,
+      "completions/mean_terminated_length": 6844.83837890625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 0.971405878663063,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002513247774913907,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 290329082.0,
+      "reward": 0.328125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999737739562988,
+      "sampling/importance_sampling_ratio/min": 3.152207455059397e-07,
+      "sampling/sampling_logp_difference/max": 14.969992637634277,
+      "sampling/sampling_logp_difference/mean": 0.022366533055901527,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 1.6507752206962323e-05,
+      "clip_ratio/high_mean": 4.126938051740581e-06,
+      "clip_ratio/low_mean": 1.7493430505055585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1620368215735652e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15581.0,
+      "completions/mean_length": 6412.2109375,
+      "completions/mean_terminated_length": 6333.69287109375,
+      "completions/min_length": 544.0,
+      "completions/min_terminated_length": 544.0,
+      "entropy": 0.9136044681072235,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0056767817586660385,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 291170133.0,
+      "reward": 0.421875,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999720454216003,
+      "sampling/importance_sampling_ratio/min": 0.000458698661532253,
+      "sampling/sampling_logp_difference/max": 7.687117099761963,
+      "sampling/sampling_logp_difference/mean": 0.020012658089399338,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 8.26085442895419e-06,
+      "clip_ratio/high_mean": 2.0652136072385474e-06,
+      "clip_ratio/low_mean": 3.6938338666914206e-05,
+      "clip_ratio/low_min": 5.699044777429663e-06,
+      "clip_ratio/region_mean": 3.900355193309224e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16111.0,
+      "completions/mean_length": 8066.1015625,
+      "completions/mean_terminated_length": 7797.7822265625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 1.0789504647254944,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00243841833434999,
+      "learning_rate": 1e-05,
+      "loss": 0.0432,
+      "num_tokens": 292222082.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999664425849915,
+      "sampling/importance_sampling_ratio/min": 8.481895929435268e-05,
+      "sampling/sampling_logp_difference/max": 9.374991416931152,
+      "sampling/sampling_logp_difference/mean": 0.023650091141462326,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 5.320054697222076e-06,
+      "clip_ratio/high_mean": 1.330013674305519e-06,
+      "clip_ratio/low_mean": 1.9117383317279746e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0447396991585265e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15176.0,
+      "completions/mean_length": 6836.046875,
+      "completions/mean_terminated_length": 6606.896484375,
+      "completions/min_length": 785.0,
+      "completions/min_terminated_length": 785.0,
+      "entropy": 1.218759760260582,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0020856577903032303,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 293115984.0,
+      "reward": 0.21875,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999911785125732,
+      "sampling/importance_sampling_ratio/min": 2.784526441246271e-05,
+      "sampling/sampling_logp_difference/max": 10.488847732543945,
+      "sampling/sampling_logp_difference/mean": 0.022012067958712578,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 2.5695502699818462e-05,
+      "clip_ratio/high_mean": 7.549717793153832e-06,
+      "clip_ratio/low_mean": 4.6741323160404136e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.429104089671455e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15796.0,
+      "completions/mean_length": 7501.9921875,
+      "completions/mean_terminated_length": 7140.9345703125,
+      "completions/min_length": 1237.0,
+      "completions/min_terminated_length": 1237.0,
+      "entropy": 0.8940394818782806,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005163854919373989,
+      "learning_rate": 1e-05,
+      "loss": 0.0354,
+      "num_tokens": 294099503.0,
+      "reward": 0.328125,
+      "reward_std": 0.30904707312583923,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999276399612427,
+      "sampling/importance_sampling_ratio/min": 0.0006545600481331348,
+      "sampling/sampling_logp_difference/max": 7.331547260284424,
+      "sampling/sampling_logp_difference/mean": 0.020813245326280594,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 3.1606674838258186e-05,
+      "clip_ratio/high_mean": 9.45794374729303e-06,
+      "clip_ratio/low_mean": 4.5567895540443715e-05,
+      "clip_ratio/low_min": 4.458871444512624e-06,
+      "clip_ratio/region_mean": 5.502583962879726e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7204.828125,
+      "completions/mean_terminated_length": 6908.7255859375,
+      "completions/min_length": 846.0,
+      "completions/min_terminated_length": 846.0,
+      "entropy": 0.9961872175335884,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029277894645929337,
+      "learning_rate": 1e-05,
+      "loss": 0.0963,
+      "num_tokens": 295042105.0,
+      "reward": 0.390625,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000677108764648,
+      "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05,
+      "sampling/sampling_logp_difference/max": 10.872637748718262,
+      "sampling/sampling_logp_difference/mean": 0.020187582820653915,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 1.7963964182854397e-05,
+      "clip_ratio/high_mean": 5.194059781388205e-06,
+      "clip_ratio/low_mean": 1.8380221035840805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.357428081722901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15856.0,
+      "completions/mean_length": 6256.859375,
+      "completions/mean_terminated_length": 6013.80810546875,
+      "completions/min_length": 1006.0,
+      "completions/min_terminated_length": 1006.0,
+      "entropy": 0.9293600022792816,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032952844630926847,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 295867039.0,
+      "reward": 0.46875,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999649524688721,
+      "sampling/importance_sampling_ratio/min": 7.995560008566827e-05,
+      "sampling/sampling_logp_difference/max": 9.434039115905762,
+      "sampling/sampling_logp_difference/mean": 0.019491540268063545,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 7.577551059512189e-06,
+      "clip_ratio/high_mean": 1.8943877648780472e-06,
+      "clip_ratio/low_mean": 2.7479814093567256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9374201631071628e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15412.0,
+      "completions/mean_length": 7397.84375,
+      "completions/mean_terminated_length": 7032.552734375,
+      "completions/min_length": 923.0,
+      "completions/min_terminated_length": 923.0,
+      "entropy": 0.8508890569210052,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029417150653898716,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 296832843.0,
+      "reward": 0.375,
+      "reward_std": 0.2867125868797302,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000183582305908,
+      "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05,
+      "sampling/sampling_logp_difference/max": 10.93724250793457,
+      "sampling/sampling_logp_difference/mean": 0.01975393109023571,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 3.281225508544594e-05,
+      "clip_ratio/high_mean": 1.3302957199812226e-05,
+      "clip_ratio/low_mean": 5.109179869577929e-05,
+      "clip_ratio/low_min": 6.657612175331451e-06,
+      "clip_ratio/region_mean": 6.439475532715733e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14983.0,
+      "completions/mean_length": 6897.765625,
+      "completions/mean_terminated_length": 6823.07080078125,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.9046694040298462,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026788609102368355,
+      "learning_rate": 1e-05,
+      "loss": 0.0664,
+      "num_tokens": 297735285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3266732692718506,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999909520149231,
+      "sampling/importance_sampling_ratio/min": 0.001710799871943891,
+      "sampling/sampling_logp_difference/max": 6.370794296264648,
+      "sampling/sampling_logp_difference/mean": 0.020578179508447647,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 1.7319889593636617e-05,
+      "clip_ratio/high_mean": 5.168538336874917e-06,
+      "clip_ratio/low_mean": 7.019768918326008e-05,
+      "clip_ratio/low_min": 2.541147478041239e-05,
+      "clip_ratio/region_mean": 7.53662266106403e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15525.0,
+      "completions/mean_length": 6971.9921875,
+      "completions/mean_terminated_length": 6509.10595703125,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "entropy": 0.8658201694488525,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005915141198784113,
+      "learning_rate": 1e-05,
+      "loss": 0.0923,
+      "num_tokens": 298645124.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3742823898792267,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999268651008606,
+      "sampling/importance_sampling_ratio/min": 0.000970841443631798,
+      "sampling/sampling_logp_difference/max": 6.937347412109375,
+      "sampling/sampling_logp_difference/mean": 0.01906151883304119,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.8332865238335216e-05,
+      "clip_ratio/high_mean": 4.583216309583804e-06,
+      "clip_ratio/low_mean": 6.167940273371642e-05,
+      "clip_ratio/low_min": 5.969151516183047e-06,
+      "clip_ratio/region_mean": 6.626261847486603e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15054.0,
+      "completions/mean_length": 6545.6953125,
+      "completions/mean_terminated_length": 5889.80859375,
+      "completions/min_length": 800.0,
+      "completions/min_terminated_length": 800.0,
+      "entropy": 0.779609851539135,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0032792428974062204,
+      "learning_rate": 1e-05,
+      "loss": 0.097,
+      "num_tokens": 299503781.0,
+      "reward": 0.609375,
+      "reward_std": 0.38293448090553284,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999361634254456,
+      "sampling/importance_sampling_ratio/min": 0.002187495119869709,
+      "sampling/sampling_logp_difference/max": 6.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.017413027584552765,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.46246323235755e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.46246323235755e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7226.515625,
+      "completions/mean_terminated_length": 7006.736328125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9573849961161613,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005092279519885778,
+      "learning_rate": 1e-05,
+      "loss": 0.1102,
+      "num_tokens": 300447903.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999373555183411,
+      "sampling/importance_sampling_ratio/min": 0.000627054600045085,
+      "sampling/sampling_logp_difference/max": 7.374476909637451,
+      "sampling/sampling_logp_difference/mean": 0.021570835262537003,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 5.487269390869187e-06,
+      "clip_ratio/high_mean": 1.3718173477172968e-06,
+      "clip_ratio/low_mean": 4.7280102080549113e-05,
+      "clip_ratio/low_min": 1.0166083029616857e-05,
+      "clip_ratio/region_mean": 4.865191931457957e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14967.0,
+      "completions/mean_length": 5755.171875,
+      "completions/mean_terminated_length": 5323.10546875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8482184633612633,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005033228080719709,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 301206021.0,
+      "reward": 0.390625,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999947547912598,
+      "sampling/importance_sampling_ratio/min": 0.0014573346124961972,
+      "sampling/sampling_logp_difference/max": 6.531146049499512,
+      "sampling/sampling_logp_difference/mean": 0.018870476633310318,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 5.421346941147931e-06,
+      "clip_ratio/high_mean": 1.3553367352869827e-06,
+      "clip_ratio/low_mean": 1.6510994441887306e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.786633117717429e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 7098.7265625,
+      "completions/mean_terminated_length": 6875.88037109375,
+      "completions/min_length": 947.0,
+      "completions/min_terminated_length": 947.0,
+      "entropy": 0.87320177257061,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.007659573573619127,
+      "learning_rate": 1e-05,
+      "loss": 0.0707,
+      "num_tokens": 302133890.0,
+      "reward": 0.421875,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0012466582702472806,
+      "sampling/sampling_logp_difference/max": 6.687288761138916,
+      "sampling/sampling_logp_difference/mean": 0.019994346424937248,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 1.1556229310372146e-05,
+      "clip_ratio/high_mean": 2.8890573275930365e-06,
+      "clip_ratio/low_mean": 3.8744643916288624e-05,
+      "clip_ratio/low_min": 6.108287834649673e-06,
+      "clip_ratio/region_mean": 4.1633702039689524e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16139.0,
+      "completions/mean_length": 6399.96875,
+      "completions/mean_terminated_length": 6077.90283203125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9481896534562111,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014135175151750445,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 302972566.0,
+      "reward": 0.4140625,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0025698256213217974,
+      "sampling/sampling_logp_difference/max": 5.963917255401611,
+      "sampling/sampling_logp_difference/mean": 0.02073008380830288,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 6.59491388432798e-06,
+      "clip_ratio/high_mean": 2.545892130001448e-06,
+      "clip_ratio/low_mean": 4.620846755187813e-05,
+      "clip_ratio/low_min": 6.243132702365983e-06,
+      "clip_ratio/region_mean": 4.875435956819274e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 7298.078125,
+      "completions/mean_terminated_length": 7226.53564453125,
+      "completions/min_length": 1009.0,
+      "completions/min_terminated_length": 1009.0,
+      "entropy": 0.8719206526875496,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027898226398974657,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 303925976.0,
+      "reward": 0.484375,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.005236432887613773,
+      "sampling/sampling_logp_difference/max": 5.252114772796631,
+      "sampling/sampling_logp_difference/mean": 0.020944103598594666,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 1.052124343914329e-05,
+      "clip_ratio/high_mean": 2.6303108597858227e-06,
+      "clip_ratio/low_mean": 2.010384196182713e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.273415248055244e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14980.0,
+      "completions/mean_length": 5667.0390625,
+      "completions/mean_terminated_length": 5496.9287109375,
+      "completions/min_length": 974.0,
+      "completions/min_terminated_length": 974.0,
+      "entropy": 0.8791451379656792,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012764945859089494,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 304675157.0,
+      "reward": 0.390625,
+      "reward_std": 0.17965976893901825,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000383853912354,
+      "sampling/importance_sampling_ratio/min": 5.054428584116977e-06,
+      "sampling/sampling_logp_difference/max": 12.195245742797852,
+      "sampling/sampling_logp_difference/mean": 0.018928447738289833,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 9.578045592206763e-06,
+      "clip_ratio/high_mean": 2.3945113980516908e-06,
+      "clip_ratio/low_mean": 3.1114799753595435e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350931149270764e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15354.0,
+      "completions/max_terminated_length": 15354.0,
+      "completions/mean_length": 5874.4453125,
+      "completions/mean_terminated_length": 5874.4453125,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9577538818120956,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00509974779561162,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 305447038.0,
+      "reward": 0.515625,
+      "reward_std": 0.24777325987815857,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999423027038574,
+      "sampling/importance_sampling_ratio/min": 0.004791648127138615,
+      "sampling/sampling_logp_difference/max": 5.340880870819092,
+      "sampling/sampling_logp_difference/mean": 0.02114470861852169,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 1.0903062275247066e-05,
+      "clip_ratio/high_mean": 2.7257655688117666e-06,
+      "clip_ratio/low_mean": 4.784364205079328e-05,
+      "clip_ratio/low_min": 3.861600362142781e-06,
+      "clip_ratio/region_mean": 5.056940744907479e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 6197.5703125,
+      "completions/mean_terminated_length": 6035.88134765625,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.8665244281291962,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030849494505673647,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 306258023.0,
+      "reward": 0.515625,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998056888580322,
+      "sampling/importance_sampling_ratio/min": 0.000830297009088099,
+      "sampling/sampling_logp_difference/max": 7.093727111816406,
+      "sampling/sampling_logp_difference/mean": 0.021017421036958694,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 1.4299712574938894e-05,
+      "clip_ratio/high_mean": 4.3520980170796975e-06,
+      "clip_ratio/low_mean": 6.213493452378316e-05,
+      "clip_ratio/low_min": 1.0056635801447555e-05,
+      "clip_ratio/region_mean": 6.648703174505499e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 7522.578125,
+      "completions/mean_terminated_length": 7381.9208984375,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.8185881152749062,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002946985885500908,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 307240305.0,
+      "reward": 0.3125,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.005127199459820986,
+      "sampling/sampling_logp_difference/max": 5.273195743560791,
+      "sampling/sampling_logp_difference/mean": 0.01965932548046112,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.693051035545068e-05,
+      "clip_ratio/high_mean": 5.08456730585749e-06,
+      "clip_ratio/low_mean": 4.2052345861520735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.713691282631771e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14090.0,
+      "completions/mean_length": 6403.2265625,
+      "completions/mean_terminated_length": 6163.6884765625,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "entropy": 0.8359840363264084,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031181599479168653,
+      "learning_rate": 1e-05,
+      "loss": 0.072,
+      "num_tokens": 308079318.0,
+      "reward": 0.5,
+      "reward_std": 0.27145031094551086,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999215602874756,
+      "sampling/importance_sampling_ratio/min": 6.73715621815063e-05,
+      "sampling/sampling_logp_difference/max": 9.605287551879883,
+      "sampling/sampling_logp_difference/mean": 0.01963040418922901,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 1.3988919135954347e-05,
+      "clip_ratio/high_mean": 3.497229783988587e-06,
+      "clip_ratio/low_mean": 6.722658486069122e-05,
+      "clip_ratio/low_min": 1.858519090092159e-05,
+      "clip_ratio/region_mean": 7.072381458783639e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16148.0,
+      "completions/mean_length": 7954.03125,
+      "completions/mean_terminated_length": 7751.71240234375,
+      "completions/min_length": 632.0,
+      "completions/min_terminated_length": 632.0,
+      "entropy": 0.905990719795227,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002656223252415657,
+      "learning_rate": 1e-05,
+      "loss": 0.1022,
+      "num_tokens": 309117770.0,
+      "reward": 0.3828125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999536275863647,
+      "sampling/importance_sampling_ratio/min": 0.0003354826185386628,
+      "sampling/sampling_logp_difference/max": 7.999940395355225,
+      "sampling/sampling_logp_difference/mean": 0.020741507411003113,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.7610595023143105e-05,
+      "clip_ratio/high_mean": 4.402648755785776e-06,
+      "clip_ratio/low_mean": 4.337988764291367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.778253651238629e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6630.09375,
+      "completions/mean_terminated_length": 6315.45166015625,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.870736837387085,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0060529084876179695,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 309988894.0,
+      "reward": 0.515625,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998822212219238,
+      "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05,
+      "sampling/sampling_logp_difference/max": 10.716434478759766,
+      "sampling/sampling_logp_difference/mean": 0.02060208097100258,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 1.0448093235027045e-05,
+      "clip_ratio/high_mean": 2.6120233087567613e-06,
+      "clip_ratio/low_mean": 3.1030769946482906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.364279325523967e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15920.0,
+      "completions/max_terminated_length": 15920.0,
+      "completions/mean_length": 6679.6171875,
+      "completions/mean_terminated_length": 6679.6171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9812518879771233,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00400698184967041,
+      "learning_rate": 1e-05,
+      "loss": 0.0605,
+      "num_tokens": 310864013.0,
+      "reward": 0.421875,
+      "reward_std": 0.3295465111732483,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999049305915833,
+      "sampling/importance_sampling_ratio/min": 0.0020593837834894657,
+      "sampling/sampling_logp_difference/max": 6.1853485107421875,
+      "sampling/sampling_logp_difference/mean": 0.02098071575164795,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 2.124982574969181e-05,
+      "clip_ratio/high_mean": 7.736592579021817e-06,
+      "clip_ratio/low_mean": 2.900951585615985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.674610888992902e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14541.0,
+      "completions/mean_length": 5523.796875,
+      "completions/mean_terminated_length": 5173.4677734375,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9120645374059677,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005929585546255112,
+      "learning_rate": 1e-05,
+      "loss": 0.0362,
+      "num_tokens": 311589987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998446702957153,
+      "sampling/importance_sampling_ratio/min": 0.0010661041596904397,
+      "sampling/sampling_logp_difference/max": 6.843744277954102,
+      "sampling/sampling_logp_difference/mean": 0.019948206841945648,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 2.4486997745043482e-05,
+      "clip_ratio/high_mean": 8.219769085826556e-06,
+      "clip_ratio/low_mean": 5.346400575945154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.168377467474784e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15401.0,
+      "completions/mean_length": 6361.3671875,
+      "completions/mean_terminated_length": 6282.44873046875,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.8044678047299385,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006622390355914831,
+      "learning_rate": 1e-05,
+      "loss": 0.1023,
+      "num_tokens": 312424034.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3724474310874939,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000219345092773,
+      "sampling/importance_sampling_ratio/min": 0.0003157092141918838,
+      "sampling/sampling_logp_difference/max": 8.060688972473145,
+      "sampling/sampling_logp_difference/mean": 0.018907658755779266,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 1.0407376748844399e-05,
+      "clip_ratio/high_mean": 2.6018441872110998e-06,
+      "clip_ratio/low_mean": 5.925514369664597e-05,
+      "clip_ratio/low_min": 1.3324347946763737e-05,
+      "clip_ratio/region_mean": 6.185698703120579e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15883.0,
+      "completions/mean_length": 7109.0,
+      "completions/mean_terminated_length": 7035.96826171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9167275875806808,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004639944992959499,
+      "learning_rate": 1e-05,
+      "loss": 0.0861,
+      "num_tokens": 313353346.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3826971650123596,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999389052391052,
+      "sampling/importance_sampling_ratio/min": 0.0019070414127781987,
+      "sampling/sampling_logp_difference/max": 6.262202262878418,
+      "sampling/sampling_logp_difference/mean": 0.02155841514468193,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 3.959046694035351e-05,
+      "clip_ratio/high_mean": 1.0912523691786191e-05,
+      "clip_ratio/low_mean": 3.3944450819944905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.485697365907981e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6314.2734375,
+      "completions/mean_terminated_length": 6072.60009765625,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.8780038207769394,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.007643720600754023,
+      "learning_rate": 1e-05,
+      "loss": 0.0873,
+      "num_tokens": 314180717.0,
+      "reward": 0.4609375,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999802112579346,
+      "sampling/importance_sampling_ratio/min": 0.021285315975546837,
+      "sampling/sampling_logp_difference/max": 3.8497378826141357,
+      "sampling/sampling_logp_difference/mean": 0.01964358240365982,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 3.065382111344661e-05,
+      "clip_ratio/high_mean": 9.187473835936544e-06,
+      "clip_ratio/low_mean": 4.137891801292426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.056639065514901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6718.2265625,
+      "completions/mean_terminated_length": 6486.24853515625,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.8326799497008324,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050973957404494286,
+      "learning_rate": 1e-05,
+      "loss": 0.0109,
+      "num_tokens": 315060842.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3521803915500641,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000014066696167,
+      "sampling/importance_sampling_ratio/min": 0.0009130688849836588,
+      "sampling/sampling_logp_difference/max": 6.998699188232422,
+      "sampling/sampling_logp_difference/mean": 0.019501537084579468,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 8.624853762739804e-06,
+      "clip_ratio/high_mean": 2.156213440684951e-06,
+      "clip_ratio/low_mean": 1.8797969062234188e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0954182048171788e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16128.0,
+      "completions/mean_length": 8666.8359375,
+      "completions/mean_terminated_length": 7941.291015625,
+      "completions/min_length": 565.0,
+      "completions/min_terminated_length": 565.0,
+      "entropy": 0.9526705741882324,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019092690199613571,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 316190325.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999814629554749,
+      "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05,
+      "sampling/sampling_logp_difference/max": 10.249995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02051631174981594,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 2.147400391550036e-05,
+      "clip_ratio/high_mean": 6.434908300434472e-06,
+      "clip_ratio/low_mean": 3.521234066283796e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.164724816746457e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15164.0,
+      "completions/mean_length": 7661.8203125,
+      "completions/mean_terminated_length": 7002.16015625,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 0.8322782590985298,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019530428107827902,
+      "learning_rate": 1e-05,
+      "loss": 0.0729,
+      "num_tokens": 317191878.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21382391452789307,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 8.546619210392237e-05,
+      "sampling/sampling_logp_difference/max": 9.367389678955078,
+      "sampling/sampling_logp_difference/mean": 0.019894573837518692,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.9436202364886412e-05,
+      "clip_ratio/high_mean": 6.089704697842535e-06,
+      "clip_ratio/low_mean": 4.2698405422925134e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.878810955233348e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15934.0,
+      "completions/mean_length": 7024.859375,
+      "completions/mean_terminated_length": 6800.240234375,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.794853538274765,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031784537713974714,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 318109004.0,
+      "reward": 0.4921875,
+      "reward_std": 0.31800347566604614,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999352693557739,
+      "sampling/importance_sampling_ratio/min": 0.0002962362195830792,
+      "sampling/sampling_logp_difference/max": 8.124353408813477,
+      "sampling/sampling_logp_difference/mean": 0.018519200384616852,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 4.127455667912727e-06,
+      "clip_ratio/high_mean": 1.0318639169781818e-06,
+      "clip_ratio/low_mean": 4.342453667049995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.445640047379129e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 7282.1796875,
+      "completions/mean_terminated_length": 6912.1865234375,
+      "completions/min_length": 870.0,
+      "completions/min_terminated_length": 870.0,
+      "entropy": 0.904067650437355,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005080109462141991,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 319059075.0,
+      "reward": 0.4140625,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000062108039856,
+      "sampling/importance_sampling_ratio/min": 0.1194523349404335,
+      "sampling/sampling_logp_difference/max": 6.136754989624023,
+      "sampling/sampling_logp_difference/mean": 0.019978653639554977,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.608940076243016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.608940076243016e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15625.0,
+      "completions/mean_length": 7131.5234375,
+      "completions/mean_terminated_length": 6596.255859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.8849587142467499,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022667953744530678,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 319990046.0,
+      "reward": 0.46875,
+      "reward_std": 0.30221715569496155,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0370909757912159,
+      "sampling/sampling_logp_difference/max": 3.294381618499756,
+      "sampling/sampling_logp_difference/mean": 0.02037571743130684,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 1.5356635913121863e-05,
+      "clip_ratio/high_mean": 3.839158978280466e-06,
+      "clip_ratio/low_mean": 3.4950805911648786e-05,
+      "clip_ratio/low_min": 4.876336333836662e-06,
+      "clip_ratio/region_mean": 3.8789965287833184e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16205.0,
+      "completions/mean_length": 6655.4453125,
+      "completions/mean_terminated_length": 6578.84228515625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.7417122721672058,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00216497085057199,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 320860135.0,
+      "reward": 0.5625,
+      "reward_std": 0.3369230031967163,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 0.0005190494703128934,
+      "sampling/sampling_logp_difference/max": 7.563511371612549,
+      "sampling/sampling_logp_difference/mean": 0.01771342009305954,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 1.7605634639039636e-05,
+      "clip_ratio/high_mean": 5.297029474604642e-06,
+      "clip_ratio/low_mean": 5.688933060810086e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.218636053745286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15849.0,
+      "completions/mean_length": 7077.1640625,
+      "completions/mean_terminated_length": 6619.45068359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 0.8749325424432755,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0028338562697172165,
+      "learning_rate": 1e-05,
+      "loss": 0.0643,
+      "num_tokens": 321783852.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998220205307007,
+      "sampling/importance_sampling_ratio/min": 7.83290306571871e-06,
+      "sampling/sampling_logp_difference/max": 11.757177352905273,
+      "sampling/sampling_logp_difference/mean": 0.020299233496189117,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 7.301828190975357e-06,
+      "clip_ratio/high_mean": 1.8254570477438392e-06,
+      "clip_ratio/low_mean": 5.158197632226802e-05,
+      "clip_ratio/low_min": 3.735804057214409e-06,
+      "clip_ratio/region_mean": 5.340743223314348e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15329.0,
+      "completions/mean_length": 6034.296875,
+      "completions/mean_terminated_length": 5525.294921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.80014718323946,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022897711023688316,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 322572882.0,
+      "reward": 0.40625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999347925186157,
+      "sampling/importance_sampling_ratio/min": 0.0004105660773348063,
+      "sampling/sampling_logp_difference/max": 7.7979736328125,
+      "sampling/sampling_logp_difference/mean": 0.01858348958194256,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 9.364057859784225e-06,
+      "clip_ratio/high_mean": 3.351393047523743e-06,
+      "clip_ratio/low_mean": 4.186752630630508e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5218919240141986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 8172.109375,
+      "completions/mean_terminated_length": 7838.29248046875,
+      "completions/min_length": 733.0,
+      "completions/min_terminated_length": 733.0,
+      "entropy": 0.8732693120837212,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003263789461925626,
+      "learning_rate": 1e-05,
+      "loss": 0.0356,
+      "num_tokens": 323640904.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3237774670124054,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999354481697083,
+      "sampling/importance_sampling_ratio/min": 9.27252222027164e-06,
+      "sampling/sampling_logp_difference/max": 11.588455200195312,
+      "sampling/sampling_logp_difference/mean": 0.0208889190107584,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 2.0998899799451465e-05,
+      "clip_ratio/high_mean": 6.692962131182867e-06,
+      "clip_ratio/low_mean": 4.261424010110204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.930720297124935e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 7699.203125,
+      "completions/mean_terminated_length": 7419.04833984375,
+      "completions/min_length": 1225.0,
+      "completions/min_terminated_length": 1225.0,
+      "entropy": 0.8296505436301231,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0042716520838439465,
+      "learning_rate": 1e-05,
+      "loss": 0.0937,
+      "num_tokens": 324643858.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874234199524,
+      "sampling/importance_sampling_ratio/min": 0.00022192654432728887,
+      "sampling/sampling_logp_difference/max": 8.413164138793945,
+      "sampling/sampling_logp_difference/mean": 0.018926654011011124,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 7.061349151626928e-06,
+      "clip_ratio/high_mean": 1.765337287906732e-06,
+      "clip_ratio/low_mean": 4.5005243464402156e-05,
+      "clip_ratio/low_min": 3.861838649754645e-06,
+      "clip_ratio/region_mean": 4.6770580411248375e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16364.0,
+      "completions/max_terminated_length": 16364.0,
+      "completions/mean_length": 7450.1640625,
+      "completions/mean_terminated_length": 7450.1640625,
+      "completions/min_length": 910.0,
+      "completions/min_terminated_length": 910.0,
+      "entropy": 1.0400195196270943,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033558050636202097,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 325617687.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999459385871887,
+      "sampling/importance_sampling_ratio/min": 0.039920732378959656,
+      "sampling/sampling_logp_difference/max": 3.2208595275878906,
+      "sampling/sampling_logp_difference/mean": 0.02249298244714737,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 1.3147802746971138e-05,
+      "clip_ratio/high_mean": 3.2869506867427845e-06,
+      "clip_ratio/low_mean": 2.4451034505545977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7737984851228248e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15342.0,
+      "completions/mean_length": 6799.0703125,
+      "completions/mean_terminated_length": 6723.5986328125,
+      "completions/min_length": 1708.0,
+      "completions/min_terminated_length": 1708.0,
+      "entropy": 0.9737623482942581,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005797459278255701,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 326508384.0,
+      "reward": 0.3125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999321699142456,
+      "sampling/importance_sampling_ratio/min": 7.535634836131067e-07,
+      "sampling/sampling_logp_difference/max": 14.0984525680542,
+      "sampling/sampling_logp_difference/mean": 0.021543748676776886,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 3.3594023989280686e-06,
+      "clip_ratio/high_mean": 8.398505997320171e-07,
+      "clip_ratio/low_mean": 2.3457610382138228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4297460981870245e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 7034.3671875,
+      "completions/mean_terminated_length": 6654.30078125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8749603256583214,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002258980879560113,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 327426407.0,
+      "reward": 0.4609375,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999661445617676,
+      "sampling/importance_sampling_ratio/min": 0.008719252422451973,
+      "sampling/sampling_logp_difference/max": 4.742221832275391,
+      "sampling/sampling_logp_difference/mean": 0.01997346058487892,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 2.823375348270929e-05,
+      "clip_ratio/high_mean": 7.058438370677322e-06,
+      "clip_ratio/low_mean": 4.9395109726901865e-05,
+      "clip_ratio/low_min": 1.636556044104509e-05,
+      "clip_ratio/region_mean": 5.6453548268109444e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15240.0,
+      "completions/mean_length": 6623.078125,
+      "completions/mean_terminated_length": 6388.81640625,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.858784057199955,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002420129720121622,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 328292985.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537417411804,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998596906661987,
+      "sampling/importance_sampling_ratio/min": 0.00014900295354891568,
+      "sampling/sampling_logp_difference/max": 8.811544418334961,
+      "sampling/sampling_logp_difference/mean": 0.019645996391773224,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 1.8078507309837732e-05,
+      "clip_ratio/high_mean": 6.468551191574079e-06,
+      "clip_ratio/low_mean": 4.051302585139638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.698157727034413e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15229.0,
+      "completions/mean_length": 5902.4765625,
+      "completions/mean_terminated_length": 5564.36279296875,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.904740035533905,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004107976797968149,
+      "learning_rate": 1e-05,
+      "loss": 0.0824,
+      "num_tokens": 329067006.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3945493996143341,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999526143074036,
+      "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05,
+      "sampling/sampling_logp_difference/max": 11.37439250946045,
+      "sampling/sampling_logp_difference/mean": 0.019582755863666534,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 2.553658168835682e-05,
+      "clip_ratio/high_mean": 7.276365181496658e-06,
+      "clip_ratio/low_mean": 1.7552573126522475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.482893796695862e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6425.6015625,
+      "completions/mean_terminated_length": 6267.5322265625,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.964553713798523,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003208522219210863,
+      "learning_rate": 1e-05,
+      "loss": 0.0164,
+      "num_tokens": 329910691.0,
+      "reward": 0.359375,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999419450759888,
+      "sampling/importance_sampling_ratio/min": 0.00137569778598845,
+      "sampling/sampling_logp_difference/max": 6.588794231414795,
+      "sampling/sampling_logp_difference/mean": 0.021154657006263733,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 6.8712420215888415e-06,
+      "clip_ratio/high_mean": 1.7178105053972104e-06,
+      "clip_ratio/low_mean": 4.0991827404468495e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2709637853022286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 8006.4453125,
+      "completions/mean_terminated_length": 7594.43408203125,
+      "completions/min_length": 1235.0,
+      "completions/min_terminated_length": 1235.0,
+      "entropy": 0.8980336412787437,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002898421371355653,
+      "learning_rate": 1e-05,
+      "loss": 0.0815,
+      "num_tokens": 330956332.0,
+      "reward": 0.4296875,
+      "reward_std": 0.20175684988498688,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 9.378339746035635e-05,
+      "sampling/sampling_logp_difference/max": 9.27452278137207,
+      "sampling/sampling_logp_difference/mean": 0.021021340042352676,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.2689344689297286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2689344689297286e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15484.0,
+      "completions/max_terminated_length": 15484.0,
+      "completions/mean_length": 7068.828125,
+      "completions/mean_terminated_length": 7068.828125,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.9865007549524307,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0037063576746731997,
+      "learning_rate": 1e-05,
+      "loss": 0.0313,
+      "num_tokens": 331880918.0,
+      "reward": 0.3203125,
+      "reward_std": 0.17859892547130585,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0001819290773710236,
+      "sampling/sampling_logp_difference/max": 8.611893653869629,
+      "sampling/sampling_logp_difference/mean": 0.02072504535317421,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 5.845633268108941e-06,
+      "clip_ratio/high_mean": 1.4614083170272352e-06,
+      "clip_ratio/low_mean": 3.207486906831036e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353627721480734e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 7379.390625,
+      "completions/mean_terminated_length": 7236.4609375,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.8977236375212669,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001972826896235347,
+      "learning_rate": 1e-05,
+      "loss": 0.0228,
+      "num_tokens": 332849112.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 2.820451663865242e-05,
+      "sampling/sampling_logp_difference/max": 10.476028442382812,
+      "sampling/sampling_logp_difference/mean": 0.019411223009228706,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 4.875385002378607e-06,
+      "clip_ratio/high_mean": 1.2188462505946518e-06,
+      "clip_ratio/low_mean": 2.3530714997832547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.47495612484272e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15517.0,
+      "completions/mean_length": 6867.9609375,
+      "completions/mean_terminated_length": 6793.03125,
+      "completions/min_length": 760.0,
+      "completions/min_terminated_length": 760.0,
+      "entropy": 0.9244343340396881,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.006926023401319981,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 333746179.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1433562934398651,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.0003875594411510974,
+      "sampling/sampling_logp_difference/max": 7.8556413650512695,
+      "sampling/sampling_logp_difference/mean": 0.020311862230300903,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 1.5651628245905158e-05,
+      "clip_ratio/high_mean": 4.836261211949022e-06,
+      "clip_ratio/low_mean": 5.268017821435933e-05,
+      "clip_ratio/low_min": 3.950945028918795e-06,
+      "clip_ratio/region_mean": 5.751643902840442e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 7525.375,
+      "completions/mean_terminated_length": 6855.3955078125,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9207312315702438,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0047226278111338615,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 334731027.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3353874683380127,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999615550041199,
+      "sampling/importance_sampling_ratio/min": 0.00029753465787507594,
+      "sampling/sampling_logp_difference/max": 8.119979858398438,
+      "sampling/sampling_logp_difference/mean": 0.021496692672371864,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 3.815379886873416e-05,
+      "clip_ratio/high_mean": 9.53844971718354e-06,
+      "clip_ratio/low_mean": 4.519663821156428e-05,
+      "clip_ratio/low_min": 2.775434040813707e-06,
+      "clip_ratio/region_mean": 5.473508826980833e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16251.0,
+      "completions/mean_length": 6841.0625,
+      "completions/mean_terminated_length": 6453.13818359375,
+      "completions/min_length": 689.0,
+      "completions/min_terminated_length": 689.0,
+      "entropy": 0.8979457840323448,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004971448332071304,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 335631243.0,
+      "reward": 0.390625,
+      "reward_std": 0.2596156895160675,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999934196472168,
+      "sampling/importance_sampling_ratio/min": 9.655764188210014e-06,
+      "sampling/sampling_logp_difference/max": 11.547955513000488,
+      "sampling/sampling_logp_difference/mean": 0.020256079733371735,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 4.162365712545579e-06,
+      "clip_ratio/high_mean": 1.0405914281363948e-06,
+      "clip_ratio/low_mean": 3.1563491688757495e-05,
+      "clip_ratio/low_min": 3.1228139505401487e-06,
+      "clip_ratio/region_mean": 3.260408311689389e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15060.0,
+      "completions/mean_length": 6919.8046875,
+      "completions/mean_terminated_length": 6454.35205078125,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9241961911320686,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038604787550866604,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 336537162.0,
+      "reward": 0.375,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998080730438232,
+      "sampling/importance_sampling_ratio/min": 0.0009118975722230971,
+      "sampling/sampling_logp_difference/max": 6.999982833862305,
+      "sampling/sampling_logp_difference/mean": 0.02030865103006363,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 6.5182248363271356e-06,
+      "clip_ratio/high_mean": 1.6295562090817839e-06,
+      "clip_ratio/low_mean": 4.3847362121596234e-05,
+      "clip_ratio/low_min": 6.294533704931382e-06,
+      "clip_ratio/region_mean": 4.547691833067802e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15692.0,
+      "completions/mean_length": 7679.390625,
+      "completions/mean_terminated_length": 7099.08349609375,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "entropy": 1.0165777206420898,
+      "epoch": 0.35418583256669733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004624314606189728,
+      "learning_rate": 1e-05,
+      "loss": 0.0849,
+      "num_tokens": 337542492.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999251961708069,
+      "sampling/importance_sampling_ratio/min": 5.83546279813163e-05,
+      "sampling/sampling_logp_difference/max": 9.748971939086914,
+      "sampling/sampling_logp_difference/mean": 0.02206476218998432,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 6.00499606662197e-06,
+      "clip_ratio/high_mean": 1.5012490166554926e-06,
+      "clip_ratio/low_mean": 3.392923713363416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.543048615028965e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 5957.5859375,
+      "completions/mean_terminated_length": 5792.08740234375,
+      "completions/min_length": 1705.0,
+      "completions/min_terminated_length": 1705.0,
+      "entropy": 0.7705951780080795,
+      "epoch": 0.35510579576816925,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021966886706650257,
+      "learning_rate": 1e-05,
+      "loss": 0.0789,
+      "num_tokens": 338324279.0,
+      "reward": 0.53125,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999998927116394,
+      "sampling/importance_sampling_ratio/min": 0.0008041196851991117,
+      "sampling/sampling_logp_difference/max": 7.125762462615967,
+      "sampling/sampling_logp_difference/mean": 0.01804077997803688,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 1.5711350215497077e-05,
+      "clip_ratio/high_mean": 3.927837553874269e-06,
+      "clip_ratio/low_mean": 5.276240381135722e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.669024130838807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7269.8046875,
+      "completions/mean_terminated_length": 7198.03955078125,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 1.0025205165147781,
+      "epoch": 0.3560257589696412,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001694107661023736,
+      "learning_rate": 1e-05,
+      "loss": 0.134,
+      "num_tokens": 339274662.0,
+      "reward": 0.3359375,
+      "reward_std": 0.30487072467803955,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039769172668,
+      "sampling/importance_sampling_ratio/min": 0.0015677008777856827,
+      "sampling/sampling_logp_difference/max": 6.4581451416015625,
+      "sampling/sampling_logp_difference/mean": 0.021742526441812515,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 7.005848829066963e-06,
+      "clip_ratio/high_mean": 1.7514622072667407e-06,
+      "clip_ratio/low_mean": 5.100632029098051e-05,
+      "clip_ratio/low_min": 8.934973720897688e-06,
+      "clip_ratio/region_mean": 5.275778244140383e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7643.8359375,
+      "completions/mean_terminated_length": 7288.54443359375,
+      "completions/min_length": 1061.0,
+      "completions/min_terminated_length": 1061.0,
+      "entropy": 0.7936615869402885,
+      "epoch": 0.35694572217111314,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004587972536683083,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 340272689.0,
+      "reward": 0.5078125,
+      "reward_std": 0.35324612259864807,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999613761901855,
+      "sampling/importance_sampling_ratio/min": 0.0007390327518805861,
+      "sampling/sampling_logp_difference/max": 7.210168361663818,
+      "sampling/sampling_logp_difference/mean": 0.01862112432718277,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 1.0522736374696251e-05,
+      "clip_ratio/high_mean": 2.6306840936740628e-06,
+      "clip_ratio/low_mean": 2.139122614153166e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4021910121518886e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14401.0,
+      "completions/mean_length": 7068.734375,
+      "completions/mean_terminated_length": 6610.60595703125,
+      "completions/min_length": 775.0,
+      "completions/min_terminated_length": 775.0,
+      "entropy": 0.8858344480395317,
+      "epoch": 0.3578656853725851,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00245783943682909,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 341195599.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21594557166099548,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957263469696,
+      "sampling/importance_sampling_ratio/min": 1.526316918898374e-05,
+      "sampling/sampling_logp_difference/max": 11.090067863464355,
+      "sampling/sampling_logp_difference/mean": 0.019989900290966034,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 5.272259386401856e-06,
+      "clip_ratio/high_mean": 1.318064846600464e-06,
+      "clip_ratio/low_mean": 2.2939096254503966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4257160987417592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15788.0,
+      "completions/mean_length": 6093.296875,
+      "completions/mean_terminated_length": 5929.95263671875,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.9640207663178444,
+      "epoch": 0.35878564857405704,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0067657483741641045,
+      "learning_rate": 1e-05,
+      "loss": 0.0181,
+      "num_tokens": 341993565.0,
+      "reward": 0.4453125,
+      "reward_std": 0.12415502220392227,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998992681503296,
+      "sampling/importance_sampling_ratio/min": 0.010459281504154205,
+      "sampling/sampling_logp_difference/max": 4.56026554107666,
+      "sampling/sampling_logp_difference/mean": 0.02037961222231388,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.566248594528588e-05,
+      "clip_ratio/low_min": 4.402028480399167e-06,
+      "clip_ratio/region_mean": 4.566248594528588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16170.0,
+      "completions/max_terminated_length": 16170.0,
+      "completions/mean_length": 7620.09375,
+      "completions/mean_terminated_length": 7620.09375,
+      "completions/min_length": 1076.0,
+      "completions/min_terminated_length": 1076.0,
+      "entropy": 0.9773544892668724,
+      "epoch": 0.35970561177552896,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018817185191437602,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 342990545.0,
+      "reward": 0.3046875,
+      "reward_std": 0.18755048513412476,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0006883936002850533,
+      "sampling/sampling_logp_difference/max": 7.281149864196777,
+      "sampling/sampling_logp_difference/mean": 0.021528441458940506,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 2.6727505428425502e-05,
+      "clip_ratio/high_mean": 7.985045499481203e-06,
+      "clip_ratio/low_mean": 7.762144696243922e-05,
+      "clip_ratio/low_min": 2.4772080450929934e-05,
+      "clip_ratio/region_mean": 8.560649303035461e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15053.0,
+      "completions/mean_length": 6963.984375,
+      "completions/mean_terminated_length": 6737.904296875,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 0.9683744385838509,
+      "epoch": 0.36062557497700093,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052104732021689415,
+      "learning_rate": 1e-05,
+      "loss": 0.087,
+      "num_tokens": 343898791.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324679374695,
+      "sampling/importance_sampling_ratio/min": 0.010815954767167568,
+      "sampling/sampling_logp_difference/max": 4.526732921600342,
+      "sampling/sampling_logp_difference/mean": 0.021434593945741653,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 1.3545108686230378e-05,
+      "clip_ratio/high_mean": 4.365133804640209e-06,
+      "clip_ratio/low_mean": 2.5377692509209737e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9742826200163108e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15116.0,
+      "completions/mean_length": 6718.5078125,
+      "completions/mean_terminated_length": 6642.4013671875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9043834507465363,
+      "epoch": 0.36154553817847285,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005151392426341772,
+      "learning_rate": 1e-05,
+      "loss": 0.0085,
+      "num_tokens": 344779672.0,
+      "reward": 0.4921875,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999840497970581,
+      "sampling/importance_sampling_ratio/min": 0.0024171893019229174,
+      "sampling/sampling_logp_difference/max": 6.025149822235107,
+      "sampling/sampling_logp_difference/mean": 0.0201373603194952,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 1.2263486723895767e-05,
+      "clip_ratio/high_mean": 3.927679188109323e-06,
+      "clip_ratio/low_mean": 2.739263118201052e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132031042696326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16342.0,
+      "completions/mean_length": 7044.640625,
+      "completions/mean_terminated_length": 6820.49609375,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "entropy": 0.9017335474491119,
+      "epoch": 0.3624655013799448,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026606651954352856,
+      "learning_rate": 1e-05,
+      "loss": 0.0554,
+      "num_tokens": 345701722.0,
+      "reward": 0.3125,
+      "reward_std": 0.24146249890327454,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05,
+      "sampling/sampling_logp_difference/max": 10.157968521118164,
+      "sampling/sampling_logp_difference/mean": 0.01981864869594574,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 1.026556356009678e-05,
+      "clip_ratio/high_mean": 2.566390890024195e-06,
+      "clip_ratio/low_mean": 4.819571529424138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0762106297952414e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15476.0,
+      "completions/mean_length": 6031.875,
+      "completions/mean_terminated_length": 5950.3623046875,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.8537683561444283,
+      "epoch": 0.36338546458141674,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003957017324864864,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 346492810.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999707341194153,
+      "sampling/importance_sampling_ratio/min": 0.0015133036067709327,
+      "sampling/sampling_logp_difference/max": 6.493460178375244,
+      "sampling/sampling_logp_difference/mean": 0.018711457028985023,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 5.870488848813693e-06,
+      "clip_ratio/high_mean": 1.4676222122034233e-06,
+      "clip_ratio/low_mean": 3.637038832948747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.783801014378696e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15942.0,
+      "completions/mean_length": 7429.3515625,
+      "completions/mean_terminated_length": 6911.31396484375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.8821266070008278,
+      "epoch": 0.36430542778288866,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002122648525983095,
+      "learning_rate": 1e-05,
+      "loss": 0.1257,
+      "num_tokens": 347462871.0,
+      "reward": 0.453125,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000076293945312,
+      "sampling/importance_sampling_ratio/min": 0.00014005196862854064,
+      "sampling/sampling_logp_difference/max": 8.873497009277344,
+      "sampling/sampling_logp_difference/mean": 0.01998838409781456,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 1.0663932243915042e-05,
+      "clip_ratio/high_mean": 2.6659830609787605e-06,
+      "clip_ratio/low_mean": 6.443337406381033e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.709935701110226e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15761.0,
+      "completions/mean_length": 7131.7109375,
+      "completions/mean_terminated_length": 6833.25,
+      "completions/min_length": 821.0,
+      "completions/min_terminated_length": 821.0,
+      "entropy": 0.8575824722647667,
+      "epoch": 0.36522539098436063,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002546454081311822,
+      "learning_rate": 1e-05,
+      "loss": 0.0676,
+      "num_tokens": 348395842.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999964714050293,
+      "sampling/importance_sampling_ratio/min": 0.0002167800412280485,
+      "sampling/sampling_logp_difference/max": 8.436627388000488,
+      "sampling/sampling_logp_difference/mean": 0.0193922221660614,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 3.847337666229578e-06,
+      "clip_ratio/high_mean": 9.618344165573944e-07,
+      "clip_ratio/low_mean": 3.932982110654848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.029165563679271e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16200.0,
+      "completions/mean_length": 6858.34375,
+      "completions/mean_terminated_length": 6707.14306640625,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.9539813920855522,
+      "epoch": 0.36614535418583255,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00492837093770504,
+      "learning_rate": 1e-05,
+      "loss": 0.0818,
+      "num_tokens": 349292790.0,
+      "reward": 0.390625,
+      "reward_std": 0.1949220597743988,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998850226402283,
+      "sampling/importance_sampling_ratio/min": 0.0011153683299198747,
+      "sampling/sampling_logp_difference/max": 6.79857063293457,
+      "sampling/sampling_logp_difference/mean": 0.020318543538451195,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 1.291372609557584e-05,
+      "clip_ratio/high_mean": 3.22843152389396e-06,
+      "clip_ratio/low_mean": 3.8245348378040944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1473780811429606e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15261.0,
+      "completions/mean_length": 7809.984375,
+      "completions/mean_terminated_length": 7533.40283203125,
+      "completions/min_length": 1002.0,
+      "completions/min_terminated_length": 1002.0,
+      "entropy": 0.8353303670883179,
+      "epoch": 0.3670653173873045,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004895905964076519,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 350312556.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22567616403102875,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999260306358337,
+      "sampling/importance_sampling_ratio/min": 0.0008417933131568134,
+      "sampling/sampling_logp_difference/max": 7.0799760818481445,
+      "sampling/sampling_logp_difference/mean": 0.018754083663225174,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 1.1250081115576904e-05,
+      "clip_ratio/high_mean": 3.5690324011738994e-06,
+      "clip_ratio/low_mean": 3.196108968950284e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.553012152224255e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15057.0,
+      "completions/mean_length": 7194.9296875,
+      "completions/mean_terminated_length": 6821.39013671875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9744522422552109,
+      "epoch": 0.36798528058877644,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0032397822942584753,
+      "learning_rate": 1e-05,
+      "loss": 0.0402,
+      "num_tokens": 351252755.0,
+      "reward": 0.421875,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998766183853149,
+      "sampling/importance_sampling_ratio/min": 0.00023159870761446655,
+      "sampling/sampling_logp_difference/max": 8.370504379272461,
+      "sampling/sampling_logp_difference/mean": 0.02105094864964485,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 6.980455509619787e-06,
+      "clip_ratio/high_mean": 1.7451138774049468e-06,
+      "clip_ratio/low_mean": 2.2670621888210007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.441573599298863e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 6836.234375,
+      "completions/mean_terminated_length": 6607.08837890625,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.9149863049387932,
+      "epoch": 0.3689052437902484,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031576494220644236,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 352145873.0,
+      "reward": 0.3671875,
+      "reward_std": 0.22225630283355713,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999266862869263,
+      "sampling/importance_sampling_ratio/min": 0.0011975533561781049,
+      "sampling/sampling_logp_difference/max": 6.727474689483643,
+      "sampling/sampling_logp_difference/mean": 0.020445333793759346,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 2.3557336589874467e-05,
+      "clip_ratio/high_mean": 5.889334147468617e-06,
+      "clip_ratio/low_mean": 5.359988131203863e-05,
+      "clip_ratio/low_min": 1.3856095392839052e-05,
+      "clip_ratio/region_mean": 5.9489215118446737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 6942.65625,
+      "completions/mean_terminated_length": 6638.0966796875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "entropy": 0.7541583999991417,
+      "epoch": 0.36982520699172033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003970830701291561,
+      "learning_rate": 1e-05,
+      "loss": 0.051,
+      "num_tokens": 353056405.0,
+      "reward": 0.453125,
+      "reward_std": 0.3282659649848938,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 8.399576472584158e-06,
+      "sampling/sampling_logp_difference/max": 11.687329292297363,
+      "sampling/sampling_logp_difference/mean": 0.018101349472999573,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 2.6139805413549766e-05,
+      "clip_ratio/high_mean": 7.517377525800839e-06,
+      "clip_ratio/low_mean": 1.968103515537223e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7198412681173068e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14786.0,
+      "completions/max_terminated_length": 14786.0,
+      "completions/mean_length": 6022.1875,
+      "completions/mean_terminated_length": 6022.1875,
+      "completions/min_length": 1285.0,
+      "completions/min_terminated_length": 1285.0,
+      "entropy": 0.9535745903849602,
+      "epoch": 0.37074517019319225,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0043656788766384125,
+      "learning_rate": 1e-05,
+      "loss": 0.029,
+      "num_tokens": 353844661.0,
+      "reward": 0.4140625,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.04981832951307297,
+      "sampling/sampling_logp_difference/max": 2.9993722438812256,
+      "sampling/sampling_logp_difference/mean": 0.020655371248722076,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 9.152076700047473e-06,
+      "clip_ratio/high_mean": 2.9508817647183605e-06,
+      "clip_ratio/low_mean": 5.21388310517068e-05,
+      "clip_ratio/low_min": 2.633131089169183e-06,
+      "clip_ratio/region_mean": 5.508971298695542e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15906.0,
+      "completions/mean_length": 8068.96875,
+      "completions/mean_terminated_length": 7869.408203125,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.9473539590835571,
+      "epoch": 0.3716651333946642,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006543307099491358,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 354894689.0,
+      "reward": 0.2578125,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999514818191528,
+      "sampling/importance_sampling_ratio/min": 6.672408926533535e-05,
+      "sampling/sampling_logp_difference/max": 9.614944458007812,
+      "sampling/sampling_logp_difference/mean": 0.021852033212780952,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 2.9619268843816826e-05,
+      "clip_ratio/high_mean": 7.4048172109542065e-06,
+      "clip_ratio/low_mean": 5.5152235972855124e-05,
+      "clip_ratio/low_min": 1.0455875781190116e-05,
+      "clip_ratio/region_mean": 6.255705375224352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15748.0,
+      "completions/mean_length": 5960.1875,
+      "completions/mean_terminated_length": 5878.1103515625,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "entropy": 0.9564141109585762,
+      "epoch": 0.37258509659613614,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003351036459207535,
+      "learning_rate": 1e-05,
+      "loss": 0.0293,
+      "num_tokens": 355677273.0,
+      "reward": 0.46875,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999220371246338,
+      "sampling/importance_sampling_ratio/min": 0.0012859756825491786,
+      "sampling/sampling_logp_difference/max": 6.656237602233887,
+      "sampling/sampling_logp_difference/mean": 0.021779976785182953,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 7.957685966175632e-06,
+      "clip_ratio/high_mean": 1.989421491543908e-06,
+      "clip_ratio/low_mean": 3.758041248147492e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.956983414354909e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15669.0,
+      "completions/mean_length": 7620.21875,
+      "completions/mean_terminated_length": 7189.212890625,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 1.035948596894741,
+      "epoch": 0.3735050597976081,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031219006050378084,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 356675829.0,
+      "reward": 0.296875,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001060962677002,
+      "sampling/importance_sampling_ratio/min": 0.010141897015273571,
+      "sampling/sampling_logp_difference/max": 4.591080188751221,
+      "sampling/sampling_logp_difference/mean": 0.021951109170913696,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 2.286768199155631e-05,
+      "clip_ratio/high_mean": 5.7169204978890775e-06,
+      "clip_ratio/low_mean": 3.914574369900947e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.486266482217616e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14038.0,
+      "completions/mean_length": 5806.0234375,
+      "completions/mean_terminated_length": 5638.119140625,
+      "completions/min_length": 1319.0,
+      "completions/min_terminated_length": 1319.0,
+      "entropy": 0.8977029845118523,
+      "epoch": 0.37442502299908004,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002810312667861581,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 357438712.0,
+      "reward": 0.546875,
+      "reward_std": 0.22832970321178436,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999280571937561,
+      "sampling/importance_sampling_ratio/min": 0.0011738575994968414,
+      "sampling/sampling_logp_difference/max": 6.747459888458252,
+      "sampling/sampling_logp_difference/mean": 0.01965375244617462,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 1.2219379641464911e-05,
+      "clip_ratio/high_mean": 3.054844910366228e-06,
+      "clip_ratio/low_mean": 3.186109779562685e-05,
+      "clip_ratio/low_min": 4.3511558942554984e-06,
+      "clip_ratio/region_mean": 3.4915943160740426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15705.0,
+      "completions/max_terminated_length": 15705.0,
+      "completions/mean_length": 6537.4609375,
+      "completions/mean_terminated_length": 6537.4609375,
+      "completions/min_length": 842.0,
+      "completions/min_terminated_length": 842.0,
+      "entropy": 0.9577726796269417,
+      "epoch": 0.37534498620055196,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004516562446951866,
+      "learning_rate": 1e-05,
+      "loss": 0.0517,
+      "num_tokens": 358296731.0,
+      "reward": 0.3828125,
+      "reward_std": 0.1830746978521347,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999170303344727,
+      "sampling/importance_sampling_ratio/min": 2.384942035860149e-06,
+      "sampling/sampling_logp_difference/max": 12.946335792541504,
+      "sampling/sampling_logp_difference/mean": 0.021242395043373108,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 1.4422689218918094e-05,
+      "clip_ratio/high_mean": 3.6056723047295236e-06,
+      "clip_ratio/low_mean": 3.026239573955536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3868068385345396e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16360.0,
+      "completions/mean_length": 7896.671875,
+      "completions/mean_terminated_length": 7622.88671875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.9163230583071709,
+      "epoch": 0.37626494940202393,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003542230697348714,
+      "learning_rate": 1e-05,
+      "loss": 0.05,
+      "num_tokens": 359327001.0,
+      "reward": 0.375,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998560547828674,
+      "sampling/importance_sampling_ratio/min": 0.00010891625424847007,
+      "sampling/sampling_logp_difference/max": 9.124931335449219,
+      "sampling/sampling_logp_difference/mean": 0.020085681229829788,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 1.7827243254942005e-05,
+      "clip_ratio/high_mean": 5.474494003010477e-06,
+      "clip_ratio/low_mean": 4.2465159026505717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.793965263161226e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15297.0,
+      "completions/mean_length": 6728.7109375,
+      "completions/mean_terminated_length": 6652.68505859375,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9010183215141296,
+      "epoch": 0.37718491260349585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0035069347359240055,
+      "learning_rate": 1e-05,
+      "loss": 0.0518,
+      "num_tokens": 360208780.0,
+      "reward": 0.5390625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999571442604065,
+      "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05,
+      "sampling/sampling_logp_difference/max": 11.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.021022530272603035,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 1.0376989393989788e-05,
+      "clip_ratio/high_mean": 2.594247348497447e-06,
+      "clip_ratio/low_mean": 2.8587513156708155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1181759936771414e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6800.3984375,
+      "completions/mean_terminated_length": 6491.25,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.8654960840940475,
+      "epoch": 0.3781048758049678,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033910400234162807,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 361098567.0,
+      "reward": 0.5625,
+      "reward_std": 0.2306838035583496,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998576641082764,
+      "sampling/importance_sampling_ratio/min": 0.001449413481168449,
+      "sampling/sampling_logp_difference/max": 6.536596298217773,
+      "sampling/sampling_logp_difference/mean": 0.019660964608192444,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 2.3068858354236e-05,
+      "clip_ratio/high_mean": 7.792090059410839e-06,
+      "clip_ratio/low_mean": 5.8515578757578623e-05,
+      "clip_ratio/low_min": 1.0348648629587842e-05,
+      "clip_ratio/region_mean": 6.630766870330262e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7103.4453125,
+      "completions/mean_terminated_length": 6956.13525390625,
+      "completions/min_length": 1711.0,
+      "completions/min_terminated_length": 1711.0,
+      "entropy": 0.8317076042294502,
+      "epoch": 0.37902483900643974,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036110079381614923,
+      "learning_rate": 1e-05,
+      "loss": 0.0834,
+      "num_tokens": 362027520.0,
+      "reward": 0.546875,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999338984489441,
+      "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05,
+      "sampling/sampling_logp_difference/max": 11.458046913146973,
+      "sampling/sampling_logp_difference/mean": 0.01939362846314907,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 3.112394779236638e-06,
+      "clip_ratio/high_mean": 7.780986948091595e-07,
+      "clip_ratio/low_mean": 5.127149995587388e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.204959859383962e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15830.0,
+      "completions/mean_length": 7344.9296875,
+      "completions/mean_terminated_length": 6900.384765625,
+      "completions/min_length": 1368.0,
+      "completions/min_terminated_length": 1368.0,
+      "entropy": 0.8387318029999733,
+      "epoch": 0.37994480220791166,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002141098491847515,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 362985207.0,
+      "reward": 0.34375,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999322891235352,
+      "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05,
+      "sampling/sampling_logp_difference/max": 10.874617576599121,
+      "sampling/sampling_logp_difference/mean": 0.01929464004933834,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 5.2602786126954015e-06,
+      "clip_ratio/high_mean": 1.3150696531738504e-06,
+      "clip_ratio/low_mean": 1.7854434247510653e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9169503786997666e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16137.0,
+      "completions/mean_length": 6377.7734375,
+      "completions/mean_terminated_length": 6218.94482421875,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9732858911156654,
+      "epoch": 0.38086476540938363,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015244127716869116,
+      "learning_rate": 1e-05,
+      "loss": 0.0608,
+      "num_tokens": 363823914.0,
+      "reward": 0.4375,
+      "reward_std": 0.1988610327243805,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 0.006335465237498283,
+      "sampling/sampling_logp_difference/max": 5.061592102050781,
+      "sampling/sampling_logp_difference/mean": 0.020688029006123543,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 2.6195500595349586e-05,
+      "clip_ratio/high_mean": 6.548875148837396e-06,
+      "clip_ratio/low_mean": 3.3802934012783226e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035180882056011e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14456.0,
+      "completions/mean_length": 5599.7890625,
+      "completions/mean_terminated_length": 5340.96826171875,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8872368410229683,
+      "epoch": 0.38178472861085555,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002647512126713991,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 364561127.0,
+      "reward": 0.453125,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999077916145325,
+      "sampling/importance_sampling_ratio/min": 2.370526999584399e-06,
+      "sampling/sampling_logp_difference/max": 12.952398300170898,
+      "sampling/sampling_logp_difference/mean": 0.01878243312239647,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 2.157278959202813e-05,
+      "clip_ratio/high_mean": 5.3931973980070325e-06,
+      "clip_ratio/low_mean": 7.215861739950924e-05,
+      "clip_ratio/low_min": 1.4898997051204788e-05,
+      "clip_ratio/region_mean": 7.755181559332414e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15905.0,
+      "completions/mean_length": 7877.2890625,
+      "completions/mean_terminated_length": 7385.1650390625,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.8416353687644005,
+      "epoch": 0.3827046918123275,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018051012884825468,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 365590124.0,
+      "reward": 0.3125,
+      "reward_std": 0.28407180309295654,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.0004095165350008756,
+      "sampling/sampling_logp_difference/max": 7.800533294677734,
+      "sampling/sampling_logp_difference/mean": 0.019809434190392494,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 2.540994637456606e-05,
+      "clip_ratio/high_mean": 6.352486593641515e-06,
+      "clip_ratio/low_mean": 4.230594890941575e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8658435844117776e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16083.0,
+      "completions/mean_length": 6836.7890625,
+      "completions/mean_terminated_length": 6200.30859375,
+      "completions/min_length": 909.0,
+      "completions/min_terminated_length": 909.0,
+      "entropy": 0.8647575601935387,
+      "epoch": 0.38362465501379944,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004550795070827007,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 366486337.0,
+      "reward": 0.40625,
+      "reward_std": 0.22620806097984314,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873638153076,
+      "sampling/importance_sampling_ratio/min": 0.0001089095021598041,
+      "sampling/sampling_logp_difference/max": 9.124993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01992485672235489,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 1.1592664577619871e-05,
+      "clip_ratio/high_mean": 2.8981661444049678e-06,
+      "clip_ratio/low_mean": 3.5717548257707676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.861571451579948e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16286.0,
+      "completions/mean_length": 6884.953125,
+      "completions/mean_terminated_length": 6417.78662109375,
+      "completions/min_length": 1289.0,
+      "completions/min_terminated_length": 1289.0,
+      "entropy": 0.8691708743572235,
+      "epoch": 0.3845446182152714,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005958946421742439,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 367386163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000011920928955,
+      "sampling/importance_sampling_ratio/min": 9.519772902422119e-06,
+      "sampling/sampling_logp_difference/max": 11.562139511108398,
+      "sampling/sampling_logp_difference/mean": 0.019436441361904144,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 2.7658640192385064e-05,
+      "clip_ratio/high_mean": 8.455849524580117e-06,
+      "clip_ratio/low_mean": 3.938097847822064e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7836828116487595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15574.0,
+      "completions/mean_length": 7439.1328125,
+      "completions/mean_terminated_length": 7150.58837890625,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 0.795464999973774,
+      "epoch": 0.38546458141674333,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00558120384812355,
+      "learning_rate": 1e-05,
+      "loss": 0.1918,
+      "num_tokens": 368357500.0,
+      "reward": 0.609375,
+      "reward_std": 0.3795146346092224,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.0001159337698481977,
+      "sampling/sampling_logp_difference/max": 9.062491416931152,
+      "sampling/sampling_logp_difference/mean": 0.018824251368641853,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 8.509555527780321e-06,
+      "clip_ratio/high_mean": 2.1273888819450804e-06,
+      "clip_ratio/low_mean": 3.0958593640662e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.308598269313734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16236.0,
+      "completions/mean_length": 6751.53125,
+      "completions/mean_terminated_length": 6520.3525390625,
+      "completions/min_length": 715.0,
+      "completions/min_terminated_length": 715.0,
+      "entropy": 0.9450879693031311,
+      "epoch": 0.38638454461821525,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004628168884664774,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 369242920.0,
+      "reward": 0.359375,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999655485153198,
+      "sampling/importance_sampling_ratio/min": 0.0006074689445085824,
+      "sampling/sampling_logp_difference/max": 7.406209468841553,
+      "sampling/sampling_logp_difference/mean": 0.019376013427972794,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 1.8288420505996328e-05,
+      "clip_ratio/high_mean": 4.572105126499082e-06,
+      "clip_ratio/low_mean": 4.86290555272717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320115997164976e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16164.0,
+      "completions/mean_length": 7023.296875,
+      "completions/mean_terminated_length": 6315.3447265625,
+      "completions/min_length": 1628.0,
+      "completions/min_terminated_length": 1628.0,
+      "entropy": 0.7378111630678177,
+      "epoch": 0.3873045078196872,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00389425759203732,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 370159510.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999127388000488,
+      "sampling/importance_sampling_ratio/min": 0.00014012664905749261,
+      "sampling/sampling_logp_difference/max": 8.872963905334473,
+      "sampling/sampling_logp_difference/mean": 0.016914553940296173,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 2.1269573153404053e-05,
+      "clip_ratio/high_mean": 5.948400371380558e-06,
+      "clip_ratio/low_mean": 2.3538930747690756e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9487331687505502e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16018.0,
+      "completions/max_terminated_length": 16018.0,
+      "completions/mean_length": 7702.3046875,
+      "completions/mean_terminated_length": 7702.3046875,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "entropy": 0.9053447172045708,
+      "epoch": 0.38822447102115915,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004324545152485371,
+      "learning_rate": 1e-05,
+      "loss": 0.0149,
+      "num_tokens": 371162773.0,
+      "reward": 0.2421875,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00001060962677,
+      "sampling/importance_sampling_ratio/min": 2.283278627146501e-05,
+      "sampling/sampling_logp_difference/max": 10.687313079833984,
+      "sampling/sampling_logp_difference/mean": 0.020495830103754997,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 1.0294916819475475e-05,
+      "clip_ratio/high_mean": 2.5737292048688687e-06,
+      "clip_ratio/low_mean": 5.831611520079605e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.088984559937671e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 6904.78125,
+      "completions/mean_terminated_length": 6754.31787109375,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.7991176024079323,
+      "epoch": 0.3891444342226311,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003239463549107313,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "num_tokens": 372067241.0,
+      "reward": 0.328125,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00012340991816017777,
+      "sampling/sampling_logp_difference/max": 8.999999046325684,
+      "sampling/sampling_logp_difference/mean": 0.019042208790779114,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 2.7261318791715894e-05,
+      "clip_ratio/high_mean": 7.926559305815317e-06,
+      "clip_ratio/low_mean": 1.552133551285806e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3447895273420727e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15399.0,
+      "completions/mean_length": 6107.7421875,
+      "completions/mean_terminated_length": 5602.35205078125,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "entropy": 0.9495253190398216,
+      "epoch": 0.39006439742410304,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015464330790564418,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 372866072.0,
+      "reward": 0.421875,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971330165863,
+      "sampling/importance_sampling_ratio/min": 0.00024684349773451686,
+      "sampling/sampling_logp_difference/max": 8.306756019592285,
+      "sampling/sampling_logp_difference/mean": 0.019793221727013588,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 2.457227401464479e-05,
+      "clip_ratio/high_mean": 8.533324717063806e-06,
+      "clip_ratio/low_mean": 3.261690835643094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.115023284612107e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15939.0,
+      "completions/mean_length": 6079.8046875,
+      "completions/mean_terminated_length": 5747.4111328125,
+      "completions/min_length": 1082.0,
+      "completions/min_terminated_length": 1082.0,
+      "entropy": 0.8005363270640373,
+      "epoch": 0.39098436062557496,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024811832699924707,
+      "learning_rate": 1e-05,
+      "loss": 0.1124,
+      "num_tokens": 373663463.0,
+      "reward": 0.625,
+      "reward_std": 0.2630355656147003,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743103981018,
+      "sampling/importance_sampling_ratio/min": 0.00019348970090504736,
+      "sampling/sampling_logp_difference/max": 8.550286293029785,
+      "sampling/sampling_logp_difference/mean": 0.017151469364762306,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 3.3719989005476236e-06,
+      "clip_ratio/high_mean": 8.429997251369059e-07,
+      "clip_ratio/low_mean": 2.132218082806503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2165180553201935e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14925.0,
+      "completions/mean_length": 6453.7890625,
+      "completions/mean_terminated_length": 6375.5986328125,
+      "completions/min_length": 347.0,
+      "completions/min_terminated_length": 347.0,
+      "entropy": 0.9212624430656433,
+      "epoch": 0.39190432382704693,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031475063879042864,
+      "learning_rate": 1e-05,
+      "loss": 0.0959,
+      "num_tokens": 374517492.0,
+      "reward": 0.34375,
+      "reward_std": 0.19910329580307007,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999594688415527,
+      "sampling/importance_sampling_ratio/min": 0.015664709731936455,
+      "sampling/sampling_logp_difference/max": 4.156344890594482,
+      "sampling/sampling_logp_difference/mean": 0.019899867475032806,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 1.907509408738406e-05,
+      "clip_ratio/high_mean": 5.984868664654641e-06,
+      "clip_ratio/low_mean": 3.784128080042137e-05,
+      "clip_ratio/low_min": 3.7751804029539926e-06,
+      "clip_ratio/region_mean": 4.382614952191943e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16159.0,
+      "completions/max_terminated_length": 16159.0,
+      "completions/mean_length": 6126.9921875,
+      "completions/mean_terminated_length": 6126.9921875,
+      "completions/min_length": 1106.0,
+      "completions/min_terminated_length": 1106.0,
+      "entropy": 0.8252849578857422,
+      "epoch": 0.39282428702851885,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004200868774205446,
+      "learning_rate": 1e-05,
+      "loss": 0.0276,
+      "num_tokens": 375320339.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999815225601196,
+      "sampling/importance_sampling_ratio/min": 0.005763276945799589,
+      "sampling/sampling_logp_difference/max": 5.156249046325684,
+      "sampling/sampling_logp_difference/mean": 0.01833093911409378,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 1.8918785372079583e-05,
+      "clip_ratio/high_mean": 5.476571459439583e-06,
+      "clip_ratio/low_mean": 6.169724406390742e-05,
+      "clip_ratio/low_min": 7.494657666029525e-06,
+      "clip_ratio/region_mean": 6.717381506859965e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15411.0,
+      "completions/mean_length": 6739.09375,
+      "completions/mean_terminated_length": 6427.9677734375,
+      "completions/min_length": 1228.0,
+      "completions/min_terminated_length": 1228.0,
+      "entropy": 0.8008574098348618,
+      "epoch": 0.3937442502299908,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003204014617949724,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 376201015.0,
+      "reward": 0.5390625,
+      "reward_std": 0.37086254358291626,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998303651809692,
+      "sampling/importance_sampling_ratio/min": 0.00010144581028725952,
+      "sampling/sampling_logp_difference/max": 9.195985794067383,
+      "sampling/sampling_logp_difference/mean": 0.018961725756525993,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 1.3558789078160771e-05,
+      "clip_ratio/high_mean": 3.389697269540193e-06,
+      "clip_ratio/low_mean": 5.3925050679026754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.731474743697618e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15634.0,
+      "completions/mean_length": 7245.8984375,
+      "completions/mean_terminated_length": 6951.12060546875,
+      "completions/min_length": 1306.0,
+      "completions/min_terminated_length": 1306.0,
+      "entropy": 1.0351596996188164,
+      "epoch": 0.39466421343146274,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0039763906970620155,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 377149650.0,
+      "reward": 0.375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000600814819336,
+      "sampling/importance_sampling_ratio/min": 8.106228051474318e-05,
+      "sampling/sampling_logp_difference/max": 9.420292854309082,
+      "sampling/sampling_logp_difference/mean": 0.020948028191924095,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 1.4580486549675697e-05,
+      "clip_ratio/high_mean": 4.259903903403028e-06,
+      "clip_ratio/low_mean": 4.6149686397711775e-05,
+      "clip_ratio/low_min": 3.006686938533676e-06,
+      "clip_ratio/region_mean": 5.04095905853319e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 6958.625,
+      "completions/mean_terminated_length": 6495.08154296875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.8360240310430527,
+      "epoch": 0.39558417663293466,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0031417158897966146,
+      "learning_rate": 1e-05,
+      "loss": 0.0195,
+      "num_tokens": 378057802.0,
+      "reward": 0.515625,
+      "reward_std": 0.35771697759628296,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999384880065918,
+      "sampling/importance_sampling_ratio/min": 0.00010235882655251771,
+      "sampling/sampling_logp_difference/max": 9.187026023864746,
+      "sampling/sampling_logp_difference/mean": 0.019185224547982216,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 6.681633749394678e-06,
+      "clip_ratio/high_mean": 1.6704084373486694e-06,
+      "clip_ratio/low_mean": 5.096616632727091e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.263657521936693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15410.0,
+      "completions/max_terminated_length": 15410.0,
+      "completions/mean_length": 5696.3984375,
+      "completions/mean_terminated_length": 5696.3984375,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.7887749597430229,
+      "epoch": 0.39650413983440663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004943124484270811,
+      "learning_rate": 1e-05,
+      "loss": 0.096,
+      "num_tokens": 378808021.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999057054519653,
+      "sampling/importance_sampling_ratio/min": 0.0015042300801724195,
+      "sampling/sampling_logp_difference/max": 6.499474048614502,
+      "sampling/sampling_logp_difference/mean": 0.018845941871404648,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 1.7526824194646906e-05,
+      "clip_ratio/high_mean": 5.417880970526312e-06,
+      "clip_ratio/low_mean": 3.513921649300755e-05,
+      "clip_ratio/low_min": 6.075038982089609e-06,
+      "clip_ratio/region_mean": 4.0557096895099676e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14233.0,
+      "completions/mean_length": 6480.8828125,
+      "completions/mean_terminated_length": 6323.69091796875,
+      "completions/min_length": 1013.0,
+      "completions/min_terminated_length": 1013.0,
+      "entropy": 0.8796411231160164,
+      "epoch": 0.39742410303587855,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00595651101320982,
+      "learning_rate": 1e-05,
+      "loss": 0.0546,
+      "num_tokens": 379659710.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 0.0017907419241964817,
+      "sampling/sampling_logp_difference/max": 6.325125217437744,
+      "sampling/sampling_logp_difference/mean": 0.01906527951359749,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4512424602107785e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4512424602107785e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7501.703125,
+      "completions/mean_terminated_length": 6829.93310546875,
+      "completions/min_length": 680.0,
+      "completions/min_terminated_length": 680.0,
+      "entropy": 0.786028303205967,
+      "epoch": 0.3983440662373505,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0024527597706764936,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 380640720.0,
+      "reward": 0.5234375,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999595880508423,
+      "sampling/importance_sampling_ratio/min": 8.851602615322918e-07,
+      "sampling/sampling_logp_difference/max": 13.93749713897705,
+      "sampling/sampling_logp_difference/mean": 0.01873261108994484,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 1.4606259583160863e-05,
+      "clip_ratio/high_mean": 5.505394312876888e-06,
+      "clip_ratio/low_mean": 3.1679782978244475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7185177234277944e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15185.0,
+      "completions/mean_length": 5619.2890625,
+      "completions/mean_terminated_length": 5448.4208984375,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.8098893761634827,
+      "epoch": 0.39926402943882244,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004280989523977041,
+      "learning_rate": 1e-05,
+      "loss": 0.0514,
+      "num_tokens": 381377981.0,
+      "reward": 0.609375,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999443292617798,
+      "sampling/importance_sampling_ratio/min": 0.0010248658945783973,
+      "sampling/sampling_logp_difference/max": 6.883193492889404,
+      "sampling/sampling_logp_difference/mean": 0.017923470586538315,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 1.4808703554081148e-05,
+      "clip_ratio/high_mean": 3.702175888520287e-06,
+      "clip_ratio/low_mean": 2.3637440563106793e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7339616224253405e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5243.8203125,
+      "completions/mean_terminated_length": 5156.1025390625,
+      "completions/min_length": 576.0,
+      "completions/min_terminated_length": 576.0,
+      "entropy": 0.7485036551952362,
+      "epoch": 0.40018399264029436,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004721642471849918,
+      "learning_rate": 1e-05,
+      "loss": 0.0877,
+      "num_tokens": 382070478.0,
+      "reward": 0.6875,
+      "reward_std": 0.26538965106010437,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999414086341858,
+      "sampling/importance_sampling_ratio/min": 0.0011518355458974838,
+      "sampling/sampling_logp_difference/max": 6.7663984298706055,
+      "sampling/sampling_logp_difference/mean": 0.016579966992139816,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 3.1177480195765384e-05,
+      "clip_ratio/high_mean": 1.1174359769938746e-05,
+      "clip_ratio/low_mean": 3.602651599976525e-05,
+      "clip_ratio/low_min": 4.348733455117326e-06,
+      "clip_ratio/region_mean": 4.720087713394605e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15978.0,
+      "completions/mean_length": 7021.1796875,
+      "completions/mean_terminated_length": 6872.56396484375,
+      "completions/min_length": 1371.0,
+      "completions/min_terminated_length": 1371.0,
+      "entropy": 0.8693460151553154,
+      "epoch": 0.40110395584176634,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00329192029312253,
+      "learning_rate": 1e-05,
+      "loss": 0.0342,
+      "num_tokens": 382990245.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999822378158569,
+      "sampling/importance_sampling_ratio/min": 0.0023386883549392223,
+      "sampling/sampling_logp_difference/max": 6.058165073394775,
+      "sampling/sampling_logp_difference/mean": 0.019863136112689972,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 1.1192694955752813e-05,
+      "clip_ratio/high_mean": 2.7981737389382033e-06,
+      "clip_ratio/low_mean": 4.9078003257818636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.1876177280973934e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15344.0,
+      "completions/mean_length": 6917.625,
+      "completions/mean_terminated_length": 6452.0654296875,
+      "completions/min_length": 945.0,
+      "completions/min_terminated_length": 945.0,
+      "entropy": 0.8466897681355476,
+      "epoch": 0.40202391904323825,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0051889242604374886,
+      "learning_rate": 1e-05,
+      "loss": 0.1009,
+      "num_tokens": 383896717.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999983310699463,
+      "sampling/importance_sampling_ratio/min": 0.00015846389578655362,
+      "sampling/sampling_logp_difference/max": 8.749983787536621,
+      "sampling/sampling_logp_difference/mean": 0.019528398290276527,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 2.3224948108691024e-05,
+      "clip_ratio/high_mean": 8.263948757303297e-06,
+      "clip_ratio/low_mean": 3.8556312347282073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.682026019509067e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16175.0,
+      "completions/mean_length": 7487.5078125,
+      "completions/mean_terminated_length": 7346.2939453125,
+      "completions/min_length": 877.0,
+      "completions/min_terminated_length": 877.0,
+      "entropy": 0.9584660083055496,
+      "epoch": 0.4029438822447102,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002855573548004031,
+      "learning_rate": 1e-05,
+      "loss": 0.0087,
+      "num_tokens": 384872622.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2477683424949646,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999386668205261,
+      "sampling/importance_sampling_ratio/min": 0.0038593418430536985,
+      "sampling/sampling_logp_difference/max": 5.557258605957031,
+      "sampling/sampling_logp_difference/mean": 0.0209865253418684,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 6.171620498207631e-06,
+      "clip_ratio/high_mean": 1.5429051245519076e-06,
+      "clip_ratio/low_mean": 2.98128834401723e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.135578845103737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16092.0,
+      "completions/mean_length": 6637.5078125,
+      "completions/mean_terminated_length": 6323.1044921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 0.8841215297579765,
+      "epoch": 0.40386384544618215,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004437311552464962,
+      "learning_rate": 1e-05,
+      "loss": 0.0523,
+      "num_tokens": 385744023.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999136924743652,
+      "sampling/importance_sampling_ratio/min": 0.002925124252215028,
+      "sampling/sampling_logp_difference/max": 5.834418296813965,
+      "sampling/sampling_logp_difference/mean": 0.019490888342261314,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 1.3304874300956726e-05,
+      "clip_ratio/high_mean": 3.3262185752391815e-06,
+      "clip_ratio/low_mean": 5.443932013804442e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.776553894065728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15143.0,
+      "completions/mean_length": 5965.9765625,
+      "completions/mean_terminated_length": 5800.611328125,
+      "completions/min_length": 621.0,
+      "completions/min_terminated_length": 621.0,
+      "entropy": 0.8726934269070625,
+      "epoch": 0.4047838086476541,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002463799435645342,
+      "learning_rate": 1e-05,
+      "loss": -0.0075,
+      "num_tokens": 386525492.0,
+      "reward": 0.3984375,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.00020367901015561074,
+      "sampling/sampling_logp_difference/max": 8.4989652633667,
+      "sampling/sampling_logp_difference/mean": 0.01946769654750824,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 1.0084711902891286e-05,
+      "clip_ratio/high_mean": 3.6154040117253317e-06,
+      "clip_ratio/low_mean": 3.598771945689805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9603123695997056e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6693.109375,
+      "completions/mean_terminated_length": 6616.80322265625,
+      "completions/min_length": 1704.0,
+      "completions/min_terminated_length": 1704.0,
+      "entropy": 0.9430640190839767,
+      "epoch": 0.40570377184912604,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038990566972643137,
+      "learning_rate": 1e-05,
+      "loss": 0.0415,
+      "num_tokens": 387404842.0,
+      "reward": 0.421875,
+      "reward_std": 0.31587693095207214,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999700784683228,
+      "sampling/importance_sampling_ratio/min": 0.0011708902893587947,
+      "sampling/sampling_logp_difference/max": 6.749990940093994,
+      "sampling/sampling_logp_difference/mean": 0.020848294720053673,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 7.462686426151777e-06,
+      "clip_ratio/high_mean": 1.8656716065379442e-06,
+      "clip_ratio/low_mean": 5.234285907818048e-05,
+      "clip_ratio/low_min": 4.47803950009984e-06,
+      "clip_ratio/region_mean": 5.420853057103159e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7045.6953125,
+      "completions/mean_terminated_length": 6505.46240234375,
+      "completions/min_length": 926.0,
+      "completions/min_terminated_length": 926.0,
+      "entropy": 0.8912066072225571,
+      "epoch": 0.40662373505059796,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018510994268581271,
+      "learning_rate": 1e-05,
+      "loss": 0.099,
+      "num_tokens": 388324475.0,
+      "reward": 0.40625,
+      "reward_std": 0.32195523381233215,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999024868011475,
+      "sampling/importance_sampling_ratio/min": 0.0031757301185280085,
+      "sampling/sampling_logp_difference/max": 5.752217769622803,
+      "sampling/sampling_logp_difference/mean": 0.020547039806842804,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 2.504527083146968e-05,
+      "clip_ratio/high_mean": 6.26131770786742e-06,
+      "clip_ratio/low_mean": 6.165269871871715e-05,
+      "clip_ratio/low_min": 3.5272871627967106e-06,
+      "clip_ratio/region_mean": 6.791401551708987e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15734.0,
+      "completions/mean_length": 7480.0078125,
+      "completions/mean_terminated_length": 7266.3125,
+      "completions/min_length": 1130.0,
+      "completions/min_terminated_length": 1130.0,
+      "entropy": 0.8813760280609131,
+      "epoch": 0.40754369825206993,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004439481534063816,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 389305644.0,
+      "reward": 0.34375,
+      "reward_std": 0.31300368905067444,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999762773513794,
+      "sampling/importance_sampling_ratio/min": 0.007449973840266466,
+      "sampling/sampling_logp_difference/max": 4.899544715881348,
+      "sampling/sampling_logp_difference/mean": 0.01973455585539341,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 4.0980917219712865e-06,
+      "clip_ratio/high_mean": 1.0245229304928216e-06,
+      "clip_ratio/low_mean": 3.662567087303614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.76501939172158e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15302.0,
+      "completions/max_terminated_length": 15302.0,
+      "completions/mean_length": 7044.4453125,
+      "completions/mean_terminated_length": 7044.4453125,
+      "completions/min_length": 1229.0,
+      "completions/min_terminated_length": 1229.0,
+      "entropy": 0.9901906549930573,
+      "epoch": 0.40846366145354185,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.004181519150733948,
+      "learning_rate": 1e-05,
+      "loss": -0.0068,
+      "num_tokens": 390229373.0,
+      "reward": 0.421875,
+      "reward_std": 0.17700131237506866,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000314712524414,
+      "sampling/importance_sampling_ratio/min": 0.00022536676260642707,
+      "sampling/sampling_logp_difference/max": 8.397781372070312,
+      "sampling/sampling_logp_difference/mean": 0.021211043000221252,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 1.4909872106727562e-05,
+      "clip_ratio/high_mean": 3.7274680266818905e-06,
+      "clip_ratio/low_mean": 5.29995777469594e-05,
+      "clip_ratio/low_min": 3.708758640641463e-06,
+      "clip_ratio/region_mean": 5.672704537573736e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7815.8125,
+      "completions/mean_terminated_length": 7244.6005859375,
+      "completions/min_length": 1350.0,
+      "completions/min_terminated_length": 1350.0,
+      "entropy": 0.8278292864561081,
+      "epoch": 0.4093836246550138,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002691390924155712,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 391251141.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31222954392433167,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 0.007715471088886261,
+      "sampling/sampling_logp_difference/max": 4.864527702331543,
+      "sampling/sampling_logp_difference/mean": 0.018415704369544983,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 2.1858722902834415e-05,
+      "clip_ratio/high_mean": 6.629899417021079e-06,
+      "clip_ratio/low_mean": 3.196247394043894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.859237290271267e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15202.0,
+      "completions/mean_length": 5305.1796875,
+      "completions/mean_terminated_length": 5217.94482421875,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.8100772425532341,
+      "epoch": 0.41030358785648574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0069543467834591866,
+      "learning_rate": 1e-05,
+      "loss": 0.1153,
+      "num_tokens": 391956196.0,
+      "reward": 0.609375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000190734863281,
+      "sampling/importance_sampling_ratio/min": 0.0024869756307452917,
+      "sampling/sampling_logp_difference/max": 5.996687889099121,
+      "sampling/sampling_logp_difference/mean": 0.017318082973361015,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 2.461934036546154e-05,
+      "clip_ratio/high_mean": 8.056288947955181e-06,
+      "clip_ratio/low_mean": 5.289376917971822e-05,
+      "clip_ratio/low_min": 4.21926688431995e-06,
+      "clip_ratio/region_mean": 6.0950058468733914e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15300.0,
+      "completions/mean_length": 7299.578125,
+      "completions/mean_terminated_length": 6930.29248046875,
+      "completions/min_length": 1008.0,
+      "completions/min_terminated_length": 1008.0,
+      "entropy": 0.9955824315547943,
+      "epoch": 0.41122355105795766,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0065611582249403,
+      "learning_rate": 1e-05,
+      "loss": 0.0883,
+      "num_tokens": 392908430.0,
+      "reward": 0.4375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999696016311646,
+      "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06,
+      "sampling/sampling_logp_difference/max": 11.873339653015137,
+      "sampling/sampling_logp_difference/mean": 0.02127375639975071,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 2.4339562514796853e-05,
+      "clip_ratio/high_mean": 7.412756531266496e-06,
+      "clip_ratio/low_mean": 3.89272447591793e-05,
+      "clip_ratio/low_min": 4.047796210215893e-06,
+      "clip_ratio/region_mean": 4.6340001517819474e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 6702.9375,
+      "completions/mean_terminated_length": 6390.64501953125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.82919991761446,
+      "epoch": 0.41214351425942963,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032975098583847284,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 393788286.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999115467071533,
+      "sampling/importance_sampling_ratio/min": 0.00028582560480572283,
+      "sampling/sampling_logp_difference/max": 8.160128593444824,
+      "sampling/sampling_logp_difference/mean": 0.019461583346128464,
+      "step": 448
+    },
+    {
+      "clip_ratio/high_max": 2.3807599063729867e-05,
+      "clip_ratio/high_mean": 5.951899765932467e-06,
+      "clip_ratio/low_mean": 3.195798365140945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.790988330365508e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15244.0,
+      "completions/mean_length": 6468.9453125,
+      "completions/mean_terminated_length": 5536.7607421875,
+      "completions/min_length": 808.0,
+      "completions/min_terminated_length": 808.0,
+      "entropy": 0.6471721827983856,
+      "epoch": 0.41306347746090155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032787907402962446,
+      "learning_rate": 1e-05,
+      "loss": 0.1149,
+      "num_tokens": 394638159.0,
+      "reward": 0.625,
+      "reward_std": 0.25354722142219543,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 0.00012341380352154374,
+      "sampling/sampling_logp_difference/max": 8.999967575073242,
+      "sampling/sampling_logp_difference/mean": 0.016151495277881622,
+      "step": 449
+    },
+    {
+      "clip_ratio/high_max": 2.247072688987828e-05,
+      "clip_ratio/high_mean": 5.61768172246957e-06,
+      "clip_ratio/low_mean": 6.035319393049576e-05,
+      "clip_ratio/low_min": 4.063190772285452e-06,
+      "clip_ratio/region_mean": 6.597087667614687e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15931.0,
+      "completions/mean_length": 6547.3203125,
+      "completions/mean_terminated_length": 6230.0078125,
+      "completions/min_length": 587.0,
+      "completions/min_terminated_length": 587.0,
+      "entropy": 0.9123960956931114,
+      "epoch": 0.4139834406623735,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038375966250896454,
+      "learning_rate": 1e-05,
+      "loss": 0.0967,
+      "num_tokens": 395493872.0,
+      "reward": 0.4296875,
+      "reward_std": 0.30798619985580444,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.00016009423416107893,
+      "sampling/sampling_logp_difference/max": 8.739748001098633,
+      "sampling/sampling_logp_difference/mean": 0.019957344979047775,
+      "step": 450
+    },
+    {
+      "clip_ratio/high_max": 1.404482372890925e-05,
+      "clip_ratio/high_mean": 3.5112059322273126e-06,
+      "clip_ratio/low_mean": 2.315102483407827e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6662230766305584e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15058.0,
+      "completions/mean_length": 6291.859375,
+      "completions/mean_terminated_length": 6131.6669921875,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 0.9841655194759369,
+      "epoch": 0.41490340386384544,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003903903067111969,
+      "learning_rate": 1e-05,
+      "loss": 0.0656,
+      "num_tokens": 396320254.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2569621503353119,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 6.564632712979801e-06,
+      "sampling/sampling_logp_difference/max": 11.93381404876709,
+      "sampling/sampling_logp_difference/mean": 0.020753150805830956,
+      "step": 451
+    },
+    {
+      "clip_ratio/high_max": 1.5189204987109406e-05,
+      "clip_ratio/high_mean": 4.615214265868417e-06,
+      "clip_ratio/low_mean": 3.547988831087423e-05,
+      "clip_ratio/low_min": 3.3967392027989263e-06,
+      "clip_ratio/region_mean": 4.009510257674265e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15966.0,
+      "completions/mean_length": 7692.4296875,
+      "completions/mean_terminated_length": 7339.11376953125,
+      "completions/min_length": 1269.0,
+      "completions/min_terminated_length": 1269.0,
+      "entropy": 0.94080401211977,
+      "epoch": 0.41582336706531736,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005152889993041754,
+      "learning_rate": 1e-05,
+      "loss": 0.0511,
+      "num_tokens": 397327029.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 5.027571751270443e-05,
+      "sampling/sampling_logp_difference/max": 9.897988319396973,
+      "sampling/sampling_logp_difference/mean": 0.02036213129758835,
+      "step": 452
+    },
+    {
+      "clip_ratio/high_max": 1.733157705530175e-05,
+      "clip_ratio/high_mean": 6.0586507970583625e-06,
+      "clip_ratio/low_mean": 2.335082047011383e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9409470812424843e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15305.0,
+      "completions/mean_length": 6968.0859375,
+      "completions/mean_terminated_length": 6742.1044921875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9254838973283768,
+      "epoch": 0.41674333026678934,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035838852636516094,
+      "learning_rate": 1e-05,
+      "loss": 0.0182,
+      "num_tokens": 398237536.0,
+      "reward": 0.484375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.002404628787189722,
+      "sampling/sampling_logp_difference/max": 6.030359745025635,
+      "sampling/sampling_logp_difference/mean": 0.020200733095407486,
+      "step": 453
+    },
+    {
+      "clip_ratio/high_max": 4.464923677005572e-06,
+      "clip_ratio/high_mean": 1.116230919251393e-06,
+      "clip_ratio/low_mean": 3.311113533754906e-05,
+      "clip_ratio/low_min": 6.725854291289579e-06,
+      "clip_ratio/region_mean": 3.422736637048729e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16309.0,
+      "completions/mean_length": 8711.078125,
+      "completions/mean_terminated_length": 8199.55078125,
+      "completions/min_length": 1049.0,
+      "completions/min_terminated_length": 1049.0,
+      "entropy": 0.8735406622290611,
+      "epoch": 0.41766329346826125,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0036290446296334267,
+      "learning_rate": 1e-05,
+      "loss": 0.0412,
+      "num_tokens": 399373298.0,
+      "reward": 0.359375,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000042200088501,
+      "sampling/importance_sampling_ratio/min": 9.216561011271551e-05,
+      "sampling/sampling_logp_difference/max": 9.291923522949219,
+      "sampling/sampling_logp_difference/mean": 0.0201371181756258,
+      "step": 454
+    },
+    {
+      "clip_ratio/high_max": 3.4702664606811595e-05,
+      "clip_ratio/high_mean": 8.675666151702899e-06,
+      "clip_ratio/low_mean": 3.3217100849469716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.189276808119757e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14737.0,
+      "completions/mean_length": 6891.078125,
+      "completions/mean_terminated_length": 6663.24853515625,
+      "completions/min_length": 827.0,
+      "completions/min_terminated_length": 827.0,
+      "entropy": 0.8689641878008842,
+      "epoch": 0.41858325666973323,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004067540634423494,
+      "learning_rate": 1e-05,
+      "loss": 0.0633,
+      "num_tokens": 400273708.0,
+      "reward": 0.484375,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999425411224365,
+      "sampling/importance_sampling_ratio/min": 4.0002717582865444e-07,
+      "sampling/sampling_logp_difference/max": 14.731733322143555,
+      "sampling/sampling_logp_difference/mean": 0.019800148904323578,
+      "step": 455
+    },
+    {
+      "clip_ratio/high_max": 2.939170826721238e-06,
+      "clip_ratio/high_mean": 7.347927066803095e-07,
+      "clip_ratio/low_mean": 3.564125790944672e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6376050502440194e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15234.0,
+      "completions/mean_length": 6899.3515625,
+      "completions/mean_terminated_length": 6748.8017578125,
+      "completions/min_length": 1149.0,
+      "completions/min_terminated_length": 1149.0,
+      "entropy": 0.9442604705691338,
+      "epoch": 0.41950321987120515,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026191689539700747,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 401177497.0,
+      "reward": 0.46875,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 0.0017910725437104702,
+      "sampling/sampling_logp_difference/max": 6.3249406814575195,
+      "sampling/sampling_logp_difference/mean": 0.021380646154284477,
+      "step": 456
+    },
+    {
+      "clip_ratio/high_max": 8.99604128790088e-06,
+      "clip_ratio/high_mean": 2.24901032197522e-06,
+      "clip_ratio/low_mean": 2.57235833487357e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.797259367071092e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16226.0,
+      "completions/mean_length": 7175.8359375,
+      "completions/mean_terminated_length": 7029.6748046875,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.8653769046068192,
+      "epoch": 0.4204231830726771,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003141516586765647,
+      "learning_rate": 1e-05,
+      "loss": 0.0674,
+      "num_tokens": 402115812.0,
+      "reward": 0.4375,
+      "reward_std": 0.21040895581245422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999862909317017,
+      "sampling/importance_sampling_ratio/min": 0.001265019178390503,
+      "sampling/sampling_logp_difference/max": 6.672667980194092,
+      "sampling/sampling_logp_difference/mean": 0.01970163732767105,
+      "step": 457
+    },
+    {
+      "clip_ratio/high_max": 1.0800059499160852e-05,
+      "clip_ratio/high_mean": 2.700014874790213e-06,
+      "clip_ratio/low_mean": 3.116219727417047e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3862211807900167e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 7090.8515625,
+      "completions/mean_terminated_length": 6791.072265625,
+      "completions/min_length": 606.0,
+      "completions/min_terminated_length": 606.0,
+      "entropy": 0.9437825232744217,
+      "epoch": 0.42134314627414904,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001980370609089732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 403048385.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 1.4011449138706666e-06,
+      "sampling/sampling_logp_difference/max": 13.47822093963623,
+      "sampling/sampling_logp_difference/mean": 0.021090596914291382,
+      "step": 458
+    },
+    {
+      "clip_ratio/high_max": 2.5482850560365478e-05,
+      "clip_ratio/high_mean": 6.370712640091369e-06,
+      "clip_ratio/low_mean": 4.8558076969129615e-05,
+      "clip_ratio/low_min": 4.8952420002024155e-06,
+      "clip_ratio/region_mean": 5.4928788131292094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16175.0,
+      "completions/mean_length": 7033.65625,
+      "completions/mean_terminated_length": 6809.24853515625,
+      "completions/min_length": 1007.0,
+      "completions/min_terminated_length": 1007.0,
+      "entropy": 0.8789731040596962,
+      "epoch": 0.42226310947562096,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003833206370472908,
+      "learning_rate": 1e-05,
+      "loss": 0.059,
+      "num_tokens": 403968037.0,
+      "reward": 0.46875,
+      "reward_std": 0.28460076451301575,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000317096710205,
+      "sampling/importance_sampling_ratio/min": 0.0021942879538983107,
+      "sampling/sampling_logp_difference/max": 6.1218976974487305,
+      "sampling/sampling_logp_difference/mean": 0.019913772121071815,
+      "step": 459
+    },
+    {
+      "clip_ratio/high_max": 4.068877842655638e-06,
+      "clip_ratio/high_mean": 1.0172194606639096e-06,
+      "clip_ratio/low_mean": 6.774969961043098e-05,
+      "clip_ratio/low_min": 3.189914878021227e-06,
+      "clip_ratio/region_mean": 6.876691895740805e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16107.0,
+      "completions/mean_length": 6992.8984375,
+      "completions/mean_terminated_length": 6611.14599609375,
+      "completions/min_length": 754.0,
+      "completions/min_terminated_length": 754.0,
+      "entropy": 0.857115626335144,
+      "epoch": 0.42318307267709293,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005315023008733988,
+      "learning_rate": 1e-05,
+      "loss": 0.1581,
+      "num_tokens": 404881584.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000758171081543,
+      "sampling/importance_sampling_ratio/min": 4.546630952972919e-05,
+      "sampling/sampling_logp_difference/max": 9.998538970947266,
+      "sampling/sampling_logp_difference/mean": 0.01872519962489605,
+      "step": 460
+    },
+    {
+      "clip_ratio/high_max": 1.167047457784065e-05,
+      "clip_ratio/high_mean": 2.9176186444601626e-06,
+      "clip_ratio/low_mean": 3.3195502112448594e-05,
+      "clip_ratio/low_min": 5.25188033861923e-06,
+      "clip_ratio/region_mean": 3.611312064322192e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 6623.2578125,
+      "completions/mean_terminated_length": 6226.4794921875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "entropy": 0.8803941905498505,
+      "epoch": 0.42410303587856485,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0074885934591293335,
+      "learning_rate": 1e-05,
+      "loss": 0.1076,
+      "num_tokens": 405749105.0,
+      "reward": 0.515625,
+      "reward_std": 0.25354722142219543,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.0011723897187039256,
+      "sampling/sampling_logp_difference/max": 6.748711109161377,
+      "sampling/sampling_logp_difference/mean": 0.01930626854300499,
+      "step": 461
+    },
+    {
+      "clip_ratio/high_max": 4.11753080697963e-06,
+      "clip_ratio/high_mean": 1.0293827017449075e-06,
+      "clip_ratio/low_mean": 5.09268712676203e-05,
+      "clip_ratio/low_min": 1.1170248626513057e-05,
+      "clip_ratio/region_mean": 5.195625465148623e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15032.0,
+      "completions/mean_length": 7244.8203125,
+      "completions/mean_terminated_length": 6647.5419921875,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.9202689751982689,
+      "epoch": 0.4250229990800368,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003960717935115099,
+      "learning_rate": 1e-05,
+      "loss": 0.0536,
+      "num_tokens": 406704618.0,
+      "reward": 0.484375,
+      "reward_std": 0.2880108058452606,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 1.69715603988152e-05,
+      "sampling/sampling_logp_difference/max": 10.98397159576416,
+      "sampling/sampling_logp_difference/mean": 0.02019711770117283,
+      "step": 462
+    },
+    {
+      "clip_ratio/high_max": 2.874629831239872e-05,
+      "clip_ratio/high_mean": 1.0519701334033016e-05,
+      "clip_ratio/low_mean": 5.367962035052187e-05,
+      "clip_ratio/low_min": 6.5083827394119e-06,
+      "clip_ratio/region_mean": 6.419932219614566e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 7462.0546875,
+      "completions/mean_terminated_length": 6867.2587890625,
+      "completions/min_length": 669.0,
+      "completions/min_terminated_length": 669.0,
+      "entropy": 0.8141553401947021,
+      "epoch": 0.42594296228150874,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003602087963372469,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 407677177.0,
+      "reward": 0.421875,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999440312385559,
+      "sampling/importance_sampling_ratio/min": 0.0007806668290868402,
+      "sampling/sampling_logp_difference/max": 7.155362129211426,
+      "sampling/sampling_logp_difference/mean": 0.01856713369488716,
+      "step": 463
+    },
+    {
+      "clip_ratio/high_max": 2.6413443720230134e-05,
+      "clip_ratio/high_mean": 8.973188073468918e-06,
+      "clip_ratio/low_mean": 3.5997712757307454e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.497090230870526e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15750.0,
+      "completions/mean_length": 6683.1796875,
+      "completions/mean_terminated_length": 6529.19873046875,
+      "completions/min_length": 775.0,
+      "completions/min_terminated_length": 775.0,
+      "entropy": 0.9070071652531624,
+      "epoch": 0.42686292548298066,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004038481041789055,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 408552512.0,
+      "reward": 0.4609375,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000439882278442,
+      "sampling/importance_sampling_ratio/min": 4.474630986806005e-05,
+      "sampling/sampling_logp_difference/max": 10.014501571655273,
+      "sampling/sampling_logp_difference/mean": 0.02077356167137623,
+      "step": 464
+    },
+    {
+      "clip_ratio/high_max": 1.7171289982798044e-05,
+      "clip_ratio/high_mean": 4.292822495699511e-06,
+      "clip_ratio/low_mean": 3.225401701456576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654683996501262e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15864.0,
+      "completions/mean_length": 6472.9453125,
+      "completions/mean_terminated_length": 5985.51611328125,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8807859197258949,
+      "epoch": 0.42778288868445263,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004457853268831968,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 409399257.0,
+      "reward": 0.421875,
+      "reward_std": 0.20517179369926453,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 0.0017577135004103184,
+      "sampling/sampling_logp_difference/max": 6.343741416931152,
+      "sampling/sampling_logp_difference/mean": 0.020475786179304123,
+      "step": 465
+    },
+    {
+      "clip_ratio/high_max": 5.442162637336878e-05,
+      "clip_ratio/high_mean": 1.584139977239829e-05,
+      "clip_ratio/low_mean": 5.706528349946893e-05,
+      "clip_ratio/low_min": 2.5156462925224332e-05,
+      "clip_ratio/region_mean": 7.290668463610928e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15896.0,
+      "completions/mean_length": 5989.78125,
+      "completions/mean_terminated_length": 5654.48388671875,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.8479711338877678,
+      "epoch": 0.42870285188592455,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033953245729207993,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 410185645.0,
+      "reward": 0.5,
+      "reward_std": 0.3735082745552063,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999676942825317,
+      "sampling/importance_sampling_ratio/min": 1.781588616722729e-05,
+      "sampling/sampling_logp_difference/max": 10.935420036315918,
+      "sampling/sampling_logp_difference/mean": 0.017986344173550606,
+      "step": 466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.2673244681500364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2673244681500364e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 8299.9453125,
+      "completions/mean_terminated_length": 8171.62744140625,
+      "completions/min_length": 1123.0,
+      "completions/min_terminated_length": 1123.0,
+      "entropy": 0.9363152608275414,
+      "epoch": 0.4296228150873965,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002381247701123357,
+      "learning_rate": 1e-05,
+      "loss": 0.0651,
+      "num_tokens": 411268974.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 0.000553094083443284,
+      "sampling/sampling_logp_difference/max": 7.4999823570251465,
+      "sampling/sampling_logp_difference/mean": 0.021354343742132187,
+      "step": 467
+    },
+    {
+      "clip_ratio/high_max": 8.578695997130126e-06,
+      "clip_ratio/high_mean": 2.1446739992825314e-06,
+      "clip_ratio/low_mean": 2.84454882830687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.059016239603807e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14838.0,
+      "completions/mean_length": 7434.0546875,
+      "completions/mean_terminated_length": 7219.25634765625,
+      "completions/min_length": 898.0,
+      "completions/min_terminated_length": 898.0,
+      "entropy": 0.981913685798645,
+      "epoch": 0.43054277828886844,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006341467145830393,
+      "learning_rate": 1e-05,
+      "loss": -0.003,
+      "num_tokens": 412238117.0,
+      "reward": 0.390625,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 0.0019304680172353983,
+      "sampling/sampling_logp_difference/max": 6.249992847442627,
+      "sampling/sampling_logp_difference/mean": 0.02139873616397381,
+      "step": 468
+    },
+    {
+      "clip_ratio/high_max": 1.7187987396027893e-05,
+      "clip_ratio/high_mean": 5.150076049176278e-06,
+      "clip_ratio/low_mean": 5.4699471832009294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.9849548279089504e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15871.0,
+      "completions/mean_length": 7211.1796875,
+      "completions/mean_terminated_length": 7138.95263671875,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "entropy": 0.9307222217321396,
+      "epoch": 0.43146274149034036,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002621602965518832,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 413182860.0,
+      "reward": 0.3203125,
+      "reward_std": 0.34716784954071045,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999529123306274,
+      "sampling/importance_sampling_ratio/min": 5.1446182624204084e-05,
+      "sampling/sampling_logp_difference/max": 9.874974250793457,
+      "sampling/sampling_logp_difference/mean": 0.020250719040632248,
+      "step": 469
+    },
+    {
+      "clip_ratio/high_max": 1.0867412584047997e-05,
+      "clip_ratio/high_mean": 3.9217885614561965e-06,
+      "clip_ratio/low_mean": 4.7740833792886406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.16626223543426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15726.0,
+      "completions/mean_length": 5349.4296875,
+      "completions/mean_terminated_length": 5174.2783203125,
+      "completions/min_length": 983.0,
+      "completions/min_terminated_length": 983.0,
+      "entropy": 1.0213474333286285,
+      "epoch": 0.43238270469181234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035241330042481422,
+      "learning_rate": 1e-05,
+      "loss": 0.0657,
+      "num_tokens": 413885963.0,
+      "reward": 0.3046875,
+      "reward_std": 0.25330984592437744,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999449253082275,
+      "sampling/importance_sampling_ratio/min": 0.0003569081309251487,
+      "sampling/sampling_logp_difference/max": 7.938032150268555,
+      "sampling/sampling_logp_difference/mean": 0.01975759118795395,
+      "step": 470
+    },
+    {
+      "clip_ratio/high_max": 1.469514609198086e-05,
+      "clip_ratio/high_mean": 3.673786522995215e-06,
+      "clip_ratio/low_mean": 2.699725871480041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0671045237795624e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15357.0,
+      "completions/mean_length": 7542.8515625,
+      "completions/mean_terminated_length": 7257.65283203125,
+      "completions/min_length": 1359.0,
+      "completions/min_terminated_length": 1359.0,
+      "entropy": 0.8882969543337822,
+      "epoch": 0.43330266789328425,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014164346503093839,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 414870560.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20753081142902374,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000402927398682,
+      "sampling/importance_sampling_ratio/min": 6.435441900976002e-05,
+      "sampling/sampling_logp_difference/max": 9.651104927062988,
+      "sampling/sampling_logp_difference/mean": 0.020874422043561935,
+      "step": 471
+    },
+    {
+      "clip_ratio/high_max": 1.669827497607912e-05,
+      "clip_ratio/high_mean": 4.17456874401978e-06,
+      "clip_ratio/low_mean": 3.673103901746799e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.090560787517461e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7286.90625,
+      "completions/mean_terminated_length": 6993.451171875,
+      "completions/min_length": 977.0,
+      "completions/min_terminated_length": 977.0,
+      "entropy": 0.9254636988043785,
+      "epoch": 0.43422263109475623,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026956009678542614,
+      "learning_rate": 1e-05,
+      "loss": 0.0567,
+      "num_tokens": 415825252.0,
+      "reward": 0.328125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999917209148407,
+      "sampling/importance_sampling_ratio/min": 0.0019701423589140177,
+      "sampling/sampling_logp_difference/max": 6.229649543762207,
+      "sampling/sampling_logp_difference/mean": 0.0202642735093832,
+      "step": 472
+    },
+    {
+      "clip_ratio/high_max": 9.162045444099931e-06,
+      "clip_ratio/high_mean": 2.2905113610249828e-06,
+      "clip_ratio/low_mean": 3.818475033767754e-05,
+      "clip_ratio/low_min": 7.20606476534158e-06,
+      "clip_ratio/region_mean": 4.047526181238936e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15908.0,
+      "completions/mean_length": 7244.7421875,
+      "completions/mean_terminated_length": 6716.0244140625,
+      "completions/min_length": 1010.0,
+      "completions/min_terminated_length": 1010.0,
+      "entropy": 0.7817923128604889,
+      "epoch": 0.43514259429622815,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022128887940198183,
+      "learning_rate": 1e-05,
+      "loss": 0.0577,
+      "num_tokens": 416774011.0,
+      "reward": 0.453125,
+      "reward_std": 0.2937847375869751,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0015034435782581568,
+      "sampling/sampling_logp_difference/max": 6.499997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01840684749186039,
+      "step": 473
+    },
+    {
+      "clip_ratio/high_max": 1.2232871313244686e-05,
+      "clip_ratio/high_mean": 3.0582178283111716e-06,
+      "clip_ratio/low_mean": 3.636896872194484e-05,
+      "clip_ratio/low_min": 3.1460788250115e-06,
+      "clip_ratio/region_mean": 3.9427186266038916e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16254.0,
+      "completions/mean_length": 9042.90625,
+      "completions/mean_terminated_length": 8283.482421875,
+      "completions/min_length": 997.0,
+      "completions/min_terminated_length": 997.0,
+      "entropy": 0.9306210279464722,
+      "epoch": 0.43606255749770007,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034676652867347,
+      "learning_rate": 1e-05,
+      "loss": 0.0504,
+      "num_tokens": 417951311.0,
+      "reward": 0.265625,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999234080314636,
+      "sampling/importance_sampling_ratio/min": 0.0002641192404553294,
+      "sampling/sampling_logp_difference/max": 8.239109992980957,
+      "sampling/sampling_logp_difference/mean": 0.02112819254398346,
+      "step": 474
+    },
+    {
+      "clip_ratio/high_max": 2.5187824576278217e-05,
+      "clip_ratio/high_mean": 8.202394610634656e-06,
+      "clip_ratio/low_mean": 4.3606626604741905e-05,
+      "clip_ratio/low_min": 3.5752079838857753e-06,
+      "clip_ratio/region_mean": 5.1809020988002885e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15721.0,
+      "completions/mean_length": 6763.6328125,
+      "completions/mean_terminated_length": 6610.9287109375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9879302233457565,
+      "epoch": 0.43698252069917204,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030218157917261124,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 418836184.0,
+      "reward": 0.484375,
+      "reward_std": 0.30091896653175354,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 0.0003778560785576701,
+      "sampling/sampling_logp_difference/max": 7.880997180938721,
+      "sampling/sampling_logp_difference/mean": 0.021101050078868866,
+      "step": 475
+    },
+    {
+      "clip_ratio/high_max": 1.0644185749697499e-05,
+      "clip_ratio/high_mean": 2.6610464374243747e-06,
+      "clip_ratio/low_mean": 6.21261324340594e-05,
+      "clip_ratio/low_min": 3.6509140954876784e-06,
+      "clip_ratio/region_mean": 6.478717887148377e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15675.0,
+      "completions/mean_length": 6794.25,
+      "completions/mean_terminated_length": 6564.09619140625,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 1.0259138569235802,
+      "epoch": 0.43790248390064396,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002881827764213085,
+      "learning_rate": 1e-05,
+      "loss": 0.0592,
+      "num_tokens": 419726192.0,
+      "reward": 0.265625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999275207519531,
+      "sampling/importance_sampling_ratio/min": 9.217044407705544e-07,
+      "sampling/sampling_logp_difference/max": 13.897041320800781,
+      "sampling/sampling_logp_difference/mean": 0.0210823193192482,
+      "step": 476
+    },
+    {
+      "clip_ratio/high_max": 1.108860487875063e-05,
+      "clip_ratio/high_mean": 2.7721512196876574e-06,
+      "clip_ratio/low_mean": 4.70996876629215e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9871839337356505e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14281.0,
+      "completions/max_terminated_length": 14281.0,
+      "completions/mean_length": 5648.2109375,
+      "completions/mean_terminated_length": 5648.2109375,
+      "completions/min_length": 935.0,
+      "completions/min_terminated_length": 935.0,
+      "entropy": 0.88894472271204,
+      "epoch": 0.43882244710211593,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00289533962495625,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 420468867.0,
+      "reward": 0.484375,
+      "reward_std": 0.2675113081932068,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998449087142944,
+      "sampling/importance_sampling_ratio/min": 0.001372925122268498,
+      "sampling/sampling_logp_difference/max": 6.590811729431152,
+      "sampling/sampling_logp_difference/mean": 0.018499158322811127,
+      "step": 477
+    },
+    {
+      "clip_ratio/high_max": 4.753574557980755e-06,
+      "clip_ratio/high_mean": 1.1883936394951888e-06,
+      "clip_ratio/low_mean": 2.4103785335682915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5292179316238617e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15657.0,
+      "completions/mean_length": 6188.359375,
+      "completions/mean_terminated_length": 6026.52392578125,
+      "completions/min_length": 1085.0,
+      "completions/min_terminated_length": 1085.0,
+      "entropy": 0.8476063013076782,
+      "epoch": 0.43974241030358785,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.002749695209786296,
+      "learning_rate": 1e-05,
+      "loss": 0.0012,
+      "num_tokens": 421280881.0,
+      "reward": 0.3671875,
+      "reward_std": 0.15991678833961487,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796152114868,
+      "sampling/importance_sampling_ratio/min": 0.004578418098390102,
+      "sampling/sampling_logp_difference/max": 5.386401653289795,
+      "sampling/sampling_logp_difference/mean": 0.018456483259797096,
+      "step": 478
+    },
+    {
+      "clip_ratio/high_max": 4.1359915030625416e-05,
+      "clip_ratio/high_mean": 1.0339978757656354e-05,
+      "clip_ratio/low_mean": 4.786080125995795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8200780586048495e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 6864.3515625,
+      "completions/mean_terminated_length": 6635.88037109375,
+      "completions/min_length": 1065.0,
+      "completions/min_terminated_length": 1065.0,
+      "entropy": 0.8666203916072845,
+      "epoch": 0.4406623735050598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.005116373300552368,
+      "learning_rate": 1e-05,
+      "loss": 0.0347,
+      "num_tokens": 422177822.0,
+      "reward": 0.4453125,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 0.00020385721290949732,
+      "sampling/sampling_logp_difference/max": 8.498090744018555,
+      "sampling/sampling_logp_difference/mean": 0.01979806460440159,
+      "step": 479
+    },
+    {
+      "clip_ratio/high_max": 1.4544774558089557e-05,
+      "clip_ratio/high_mean": 3.6361936395223893e-06,
+      "clip_ratio/low_mean": 4.153812756158004e-05,
+      "clip_ratio/low_min": 3.606462769312202e-06,
+      "clip_ratio/region_mean": 4.51743208031985e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 7023.828125,
+      "completions/mean_terminated_length": 6799.18408203125,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9098334684967995,
+      "epoch": 0.44158233670653174,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0020944855641573668,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 423096576.0,
+      "reward": 0.2734375,
+      "reward_std": 0.20858672261238098,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999480247497559,
+      "sampling/importance_sampling_ratio/min": 0.0027383591514080763,
+      "sampling/sampling_logp_difference/max": 5.900396347045898,
+      "sampling/sampling_logp_difference/mean": 0.020111342892050743,
+      "step": 480
+    },
+    {
+      "clip_ratio/high_max": 3.256236095694476e-05,
+      "clip_ratio/high_mean": 1.2372795026749372e-05,
+      "clip_ratio/low_mean": 5.0774355258909054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.314715119515313e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15527.0,
+      "completions/mean_length": 6666.828125,
+      "completions/mean_terminated_length": 6512.587890625,
+      "completions/min_length": 872.0,
+      "completions/min_terminated_length": 872.0,
+      "entropy": 0.9162466824054718,
+      "epoch": 0.44250229990800366,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003897767048329115,
+      "learning_rate": 1e-05,
+      "loss": 0.1151,
+      "num_tokens": 423968050.0,
+      "reward": 0.46875,
+      "reward_std": 0.3527044653892517,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0031828521750867367,
+      "sampling/sampling_logp_difference/max": 5.7499775886535645,
+      "sampling/sampling_logp_difference/mean": 0.019923247396945953,
+      "step": 481
+    },
+    {
+      "clip_ratio/high_max": 1.5341902098953142e-05,
+      "clip_ratio/high_mean": 4.791600815678976e-06,
+      "clip_ratio/low_mean": 7.980174223121139e-05,
+      "clip_ratio/low_min": 2.6713308216130827e-05,
+      "clip_ratio/region_mean": 8.459334412691533e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16223.0,
+      "completions/mean_length": 7159.8046875,
+      "completions/mean_terminated_length": 7013.38916015625,
+      "completions/min_length": 1022.0,
+      "completions/min_terminated_length": 1022.0,
+      "entropy": 0.8444746807217598,
+      "epoch": 0.44342226310947563,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003038195427507162,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 424902953.0,
+      "reward": 0.359375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940037727356,
+      "sampling/importance_sampling_ratio/min": 7.431909580191132e-06,
+      "sampling/sampling_logp_difference/max": 11.809727668762207,
+      "sampling/sampling_logp_difference/mean": 0.019014043733477592,
+      "step": 482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.55851120666739e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.55851120666739e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14716.0,
+      "completions/mean_length": 6146.2109375,
+      "completions/mean_terminated_length": 6065.5986328125,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "entropy": 0.8365580290555954,
+      "epoch": 0.44434222631094755,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025550283025950193,
+      "learning_rate": 1e-05,
+      "loss": 0.0548,
+      "num_tokens": 425709212.0,
+      "reward": 0.5625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000015497207642,
+      "sampling/importance_sampling_ratio/min": 0.0006884043687023222,
+      "sampling/sampling_logp_difference/max": 7.281134128570557,
+      "sampling/sampling_logp_difference/mean": 0.019193854182958603,
+      "step": 483
+    },
+    {
+      "clip_ratio/high_max": 2.4752349872869672e-05,
+      "clip_ratio/high_mean": 7.036488455014478e-06,
+      "clip_ratio/low_mean": 4.780410063176532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.484058920046664e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16153.0,
+      "completions/mean_length": 6557.578125,
+      "completions/mean_terminated_length": 6321.744140625,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.8316832035779953,
+      "epoch": 0.4452621895124195,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005126865580677986,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 426566462.0,
+      "reward": 0.484375,
+      "reward_std": 0.27852246165275574,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 2.7536634661373682e-05,
+      "sampling/sampling_logp_difference/max": 10.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01839536987245083,
+      "step": 484
+    },
+    {
+      "clip_ratio/high_max": 3.443571449679439e-05,
+      "clip_ratio/high_mean": 8.608928624198597e-06,
+      "clip_ratio/low_mean": 5.915772453590762e-05,
+      "clip_ratio/low_min": 1.7084812043322017e-05,
+      "clip_ratio/region_mean": 6.776665304641938e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16359.0,
+      "completions/mean_length": 7007.3203125,
+      "completions/mean_terminated_length": 6858.484375,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8674142584204674,
+      "epoch": 0.44618215271389144,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004829525947570801,
+      "learning_rate": 1e-05,
+      "loss": 0.0753,
+      "num_tokens": 427480007.0,
+      "reward": 0.46875,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998922944068909,
+      "sampling/importance_sampling_ratio/min": 0.00020170137577224523,
+      "sampling/sampling_logp_difference/max": 8.508722305297852,
+      "sampling/sampling_logp_difference/mean": 0.019586069509387016,
+      "step": 485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.539863354897534e-05,
+      "clip_ratio/low_min": 8.211341992137022e-06,
+      "clip_ratio/region_mean": 5.539863354897534e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14748.0,
+      "completions/mean_length": 7069.8828125,
+      "completions/mean_terminated_length": 6922.0400390625,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.9066255167126656,
+      "epoch": 0.44710211591536336,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003539952216669917,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 428404968.0,
+      "reward": 0.5,
+      "reward_std": 0.3618982434272766,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 0.00024052867956925184,
+      "sampling/sampling_logp_difference/max": 8.332671165466309,
+      "sampling/sampling_logp_difference/mean": 0.020427238196134567,
+      "step": 486
+    },
+    {
+      "clip_ratio/high_max": 1.6550495729461545e-05,
+      "clip_ratio/high_mean": 4.137623932365386e-06,
+      "clip_ratio/low_mean": 5.576918465521885e-05,
+      "clip_ratio/low_min": 1.2613936178240692e-05,
+      "clip_ratio/region_mean": 5.99068093833921e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15290.0,
+      "completions/max_terminated_length": 15290.0,
+      "completions/mean_length": 5586.6875,
+      "completions/mean_terminated_length": 5586.6875,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.9208655655384064,
+      "epoch": 0.44802207911683534,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0030504625756293535,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 429137176.0,
+      "reward": 0.515625,
+      "reward_std": 0.3480040729045868,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999984502792358,
+      "sampling/importance_sampling_ratio/min": 0.0005498559912666678,
+      "sampling/sampling_logp_difference/max": 7.50585412979126,
+      "sampling/sampling_logp_difference/mean": 0.019396595656871796,
+      "step": 487
+    },
+    {
+      "clip_ratio/high_max": 3.3761509712348925e-05,
+      "clip_ratio/high_mean": 8.440377428087231e-06,
+      "clip_ratio/low_mean": 3.6384140912559815e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.482451868170756e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15404.0,
+      "completions/mean_length": 5266.265625,
+      "completions/mean_terminated_length": 4999.4404296875,
+      "completions/min_length": 492.0,
+      "completions/min_terminated_length": 492.0,
+      "entropy": 0.7884859293699265,
+      "epoch": 0.44894204231830726,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003902251599356532,
+      "learning_rate": 1e-05,
+      "loss": -0.0077,
+      "num_tokens": 429836026.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.05675617232918739,
+      "sampling/sampling_logp_difference/max": 2.868990898132324,
+      "sampling/sampling_logp_difference/mean": 0.01770034246146679,
+      "step": 488
+    },
+    {
+      "clip_ratio/high_max": 2.2323702978610527e-05,
+      "clip_ratio/high_mean": 5.580925744652632e-06,
+      "clip_ratio/low_mean": 4.0199149452746497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.578007497002545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6398.53125,
+      "completions/mean_terminated_length": 6319.9052734375,
+      "completions/min_length": 699.0,
+      "completions/min_terminated_length": 699.0,
+      "entropy": 0.8982341960072517,
+      "epoch": 0.44986200551977923,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024998660665005445,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 430673446.0,
+      "reward": 0.421875,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797940254211,
+      "sampling/importance_sampling_ratio/min": 0.000612784584518522,
+      "sampling/sampling_logp_difference/max": 7.397497177124023,
+      "sampling/sampling_logp_difference/mean": 0.020521972328424454,
+      "step": 489
+    },
+    {
+      "clip_ratio/high_max": 3.1756624366607866e-05,
+      "clip_ratio/high_mean": 7.939156091651967e-06,
+      "clip_ratio/low_mean": 8.124458963720826e-05,
+      "clip_ratio/low_min": 1.2379174222587608e-05,
+      "clip_ratio/region_mean": 8.91837471499457e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14374.0,
+      "completions/mean_length": 6277.65625,
+      "completions/mean_terminated_length": 6198.07861328125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8139145970344543,
+      "epoch": 0.45078196872125115,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00784115307033062,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 431497546.0,
+      "reward": 0.546875,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999848484992981,
+      "sampling/importance_sampling_ratio/min": 0.0006267798598855734,
+      "sampling/sampling_logp_difference/max": 7.37491512298584,
+      "sampling/sampling_logp_difference/mean": 0.01836184598505497,
+      "step": 490
+    },
+    {
+      "clip_ratio/high_max": 8.875004823494237e-06,
+      "clip_ratio/high_mean": 2.2187512058735592e-06,
+      "clip_ratio/low_mean": 2.3825880248296016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6044631454169576e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15903.0,
+      "completions/mean_length": 7708.59375,
+      "completions/mean_terminated_length": 7355.9345703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.087083138525486,
+      "epoch": 0.45170193192272307,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.004277343396097422,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 432503414.0,
+      "reward": 0.2890625,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999503493309021,
+      "sampling/importance_sampling_ratio/min": 1.2187546417408157e-05,
+      "sampling/sampling_logp_difference/max": 11.315095901489258,
+      "sampling/sampling_logp_difference/mean": 0.02224145457148552,
+      "step": 491
+    },
+    {
+      "clip_ratio/high_max": 6.384065272868611e-06,
+      "clip_ratio/high_mean": 1.5960163182171527e-06,
+      "clip_ratio/low_mean": 3.561227788395627e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.720829374742607e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7162.7109375,
+      "completions/mean_terminated_length": 6865.25,
+      "completions/min_length": 842.0,
+      "completions/min_terminated_length": 842.0,
+      "entropy": 0.9157010763883591,
+      "epoch": 0.45262189512419504,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006278311368077993,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 433439137.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2227931171655655,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966561794281,
+      "sampling/importance_sampling_ratio/min": 0.0005532125360332429,
+      "sampling/sampling_logp_difference/max": 7.499768257141113,
+      "sampling/sampling_logp_difference/mean": 0.02123419940471649,
+      "step": 492
+    },
+    {
+      "clip_ratio/high_max": 2.846911434062349e-05,
+      "clip_ratio/high_mean": 8.656040449750435e-06,
+      "clip_ratio/low_mean": 5.1716241614485625e-05,
+      "clip_ratio/low_min": 3.601579010137357e-06,
+      "clip_ratio/region_mean": 6.037228104105452e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16123.0,
+      "completions/mean_length": 7388.90625,
+      "completions/mean_terminated_length": 7023.251953125,
+      "completions/min_length": 980.0,
+      "completions/min_terminated_length": 980.0,
+      "entropy": 0.7670486867427826,
+      "epoch": 0.45354185832566696,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005177734419703484,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "num_tokens": 434402045.0,
+      "reward": 0.3828125,
+      "reward_std": 0.37951958179473877,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999250769615173,
+      "sampling/importance_sampling_ratio/min": 0.0022511729039251804,
+      "sampling/sampling_logp_difference/max": 6.096303939819336,
+      "sampling/sampling_logp_difference/mean": 0.01827731542289257,
+      "step": 493
+    },
+    {
+      "clip_ratio/high_max": 2.1548471977439476e-05,
+      "clip_ratio/high_mean": 6.257203722270788e-06,
+      "clip_ratio/low_mean": 7.719641234871233e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.345361538886209e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 6805.375,
+      "completions/mean_terminated_length": 6496.38671875,
+      "completions/min_length": 587.0,
+      "completions/min_terminated_length": 587.0,
+      "entropy": 0.8407405763864517,
+      "epoch": 0.45446182152713893,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032320048194378614,
+      "learning_rate": 1e-05,
+      "loss": 0.0662,
+      "num_tokens": 435292029.0,
+      "reward": 0.4296875,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999642372131348,
+      "sampling/importance_sampling_ratio/min": 6.679954094579443e-05,
+      "sampling/sampling_logp_difference/max": 9.613814353942871,
+      "sampling/sampling_logp_difference/mean": 0.018761277198791504,
+      "step": 494
+    },
+    {
+      "clip_ratio/high_max": 3.460495008766884e-06,
+      "clip_ratio/high_mean": 8.65123752191721e-07,
+      "clip_ratio/low_mean": 7.76378024056612e-05,
+      "clip_ratio/low_min": 1.7026316072588088e-05,
+      "clip_ratio/region_mean": 7.850292649891344e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15105.0,
+      "completions/mean_length": 5753.4140625,
+      "completions/mean_terminated_length": 5321.2763671875,
+      "completions/min_length": 946.0,
+      "completions/min_terminated_length": 946.0,
+      "entropy": 0.7848984077572823,
+      "epoch": 0.45538178472861085,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030854379292577505,
+      "learning_rate": 1e-05,
+      "loss": 0.0279,
+      "num_tokens": 436046842.0,
+      "reward": 0.578125,
+      "reward_std": 0.31405961513519287,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998626708984375,
+      "sampling/importance_sampling_ratio/min": 4.36544311810394e-09,
+      "sampling/sampling_logp_difference/max": 19.24954605102539,
+      "sampling/sampling_logp_difference/mean": 0.017733070999383926,
+      "step": 495
+    },
+    {
+      "clip_ratio/high_max": 1.7207588371093152e-05,
+      "clip_ratio/high_mean": 4.301897092773288e-06,
+      "clip_ratio/low_mean": 3.234025916754035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.664215591925313e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6522.84375,
+      "completions/mean_terminated_length": 6445.19677734375,
+      "completions/min_length": 1062.0,
+      "completions/min_terminated_length": 1062.0,
+      "entropy": 1.0593653172254562,
+      "epoch": 0.4563017479300828,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003124243812635541,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 436899638.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2706219553947449,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999418258666992,
+      "sampling/importance_sampling_ratio/min": 4.476920821616659e-06,
+      "sampling/sampling_logp_difference/max": 12.316575050354004,
+      "sampling/sampling_logp_difference/mean": 0.021180003881454468,
+      "step": 496
+    },
+    {
+      "clip_ratio/high_max": 1.1790433973146719e-05,
+      "clip_ratio/high_mean": 2.9476084932866797e-06,
+      "clip_ratio/low_mean": 2.8437304308681632e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.138491274512489e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14515.0,
+      "completions/mean_length": 6203.203125,
+      "completions/mean_terminated_length": 5874.7900390625,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.8152795508503914,
+      "epoch": 0.45722171113155474,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005001795012503862,
+      "learning_rate": 1e-05,
+      "loss": 0.0817,
+      "num_tokens": 437713008.0,
+      "reward": 0.4296875,
+      "reward_std": 0.26143795251846313,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101758003235,
+      "sampling/importance_sampling_ratio/min": 0.001757707679644227,
+      "sampling/sampling_logp_difference/max": 6.34374475479126,
+      "sampling/sampling_logp_difference/mean": 0.017751028761267662,
+      "step": 497
+    },
+    {
+      "clip_ratio/high_max": 1.3163793028070359e-05,
+      "clip_ratio/high_mean": 4.229499381835922e-06,
+      "clip_ratio/low_mean": 4.4599403963729856e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.882890357293945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15423.0,
+      "completions/mean_length": 5975.5234375,
+      "completions/mean_terminated_length": 5725.72021484375,
+      "completions/min_length": 690.0,
+      "completions/min_terminated_length": 690.0,
+      "entropy": 0.8275932744145393,
+      "epoch": 0.45814167433302666,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005084732081741095,
+      "learning_rate": 1e-05,
+      "loss": 0.0759,
+      "num_tokens": 438495811.0,
+      "reward": 0.5390625,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998699426651001,
+      "sampling/importance_sampling_ratio/min": 3.120788460364565e-05,
+      "sampling/sampling_logp_difference/max": 10.374839782714844,
+      "sampling/sampling_logp_difference/mean": 0.018671832978725433,
+      "step": 498
+    },
+    {
+      "clip_ratio/high_max": 3.229640242352616e-06,
+      "clip_ratio/high_mean": 8.07410060588154e-07,
+      "clip_ratio/low_mean": 3.0413870263146237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1221280551108066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 7019.59375,
+      "completions/mean_terminated_length": 7019.59375,
+      "completions/min_length": 1058.0,
+      "completions/min_terminated_length": 1058.0,
+      "entropy": 0.9266618490219116,
+      "epoch": 0.45906163753449863,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002567912917584181,
+      "learning_rate": 1e-05,
+      "loss": 0.0282,
+      "num_tokens": 439413055.0,
+      "reward": 0.375,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000476837158203,
+      "sampling/importance_sampling_ratio/min": 0.0010315657127648592,
+      "sampling/sampling_logp_difference/max": 6.876677513122559,
+      "sampling/sampling_logp_difference/mean": 0.02012534812092781,
+      "step": 499
+    },
+    {
+      "clip_ratio/high_max": 1.8327779343962902e-05,
+      "clip_ratio/high_mean": 4.5819448359907256e-06,
+      "clip_ratio/low_mean": 4.08189575864526e-05,
+      "clip_ratio/low_min": 4.041122338094283e-06,
+      "clip_ratio/region_mean": 4.5400901854009135e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 7373.3203125,
+      "completions/mean_terminated_length": 7082.65283203125,
+      "completions/min_length": 854.0,
+      "completions/min_terminated_length": 854.0,
+      "entropy": 0.9383682310581207,
+      "epoch": 0.45998160073597055,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004862098954617977,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 440375128.0,
+      "reward": 0.4375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.0006883886526338756,
+      "sampling/sampling_logp_difference/max": 7.28115701675415,
+      "sampling/sampling_logp_difference/mean": 0.020596595481038094,
+      "step": 500
+    },
+    {
+      "clip_ratio/high_max": 1.650619151405408e-05,
+      "clip_ratio/high_mean": 4.12654787851352e-06,
+      "clip_ratio/low_mean": 6.364750265674957e-05,
+      "clip_ratio/low_min": 3.94595599573222e-06,
+      "clip_ratio/region_mean": 6.77740499668289e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16280.0,
+      "completions/mean_length": 5944.953125,
+      "completions/mean_terminated_length": 5862.755859375,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "entropy": 0.9130716845393181,
+      "epoch": 0.4609015639374425,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041388699784875,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 441156306.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999566078186035,
+      "sampling/importance_sampling_ratio/min": 0.0007685241289436817,
+      "sampling/sampling_logp_difference/max": 7.171038627624512,
+      "sampling/sampling_logp_difference/mean": 0.019817989319562912,
+      "step": 501
+    },
+    {
+      "clip_ratio/high_max": 2.9951792839710834e-05,
+      "clip_ratio/high_mean": 9.205811807078135e-06,
+      "clip_ratio/low_mean": 3.147234815514821e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0678160075913183e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16181.0,
+      "completions/mean_length": 6686.015625,
+      "completions/mean_terminated_length": 6609.6533203125,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 0.8640913739800453,
+      "epoch": 0.46182152713891444,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005679543130099773,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 442032972.0,
+      "reward": 0.5546875,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 0.007731473073363304,
+      "sampling/sampling_logp_difference/max": 4.86245584487915,
+      "sampling/sampling_logp_difference/mean": 0.019738182425498962,
+      "step": 502
+    },
+    {
+      "clip_ratio/high_max": 3.0190597726686974e-05,
+      "clip_ratio/high_mean": 7.5476494316717435e-06,
+      "clip_ratio/low_mean": 3.858067566397949e-05,
+      "clip_ratio/low_min": 9.290916750614997e-06,
+      "clip_ratio/region_mean": 4.612832617567619e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 6945.5,
+      "completions/mean_terminated_length": 6231.6640625,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.8156519457697868,
+      "epoch": 0.46274149034038636,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006176612339913845,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 442940940.0,
+      "reward": 0.46875,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999117851257324,
+      "sampling/importance_sampling_ratio/min": 0.00018278000061400235,
+      "sampling/sampling_logp_difference/max": 8.607227325439453,
+      "sampling/sampling_logp_difference/mean": 0.01836501806974411,
+      "step": 503
+    },
+    {
+      "clip_ratio/high_max": 2.2105000425653998e-05,
+      "clip_ratio/high_mean": 6.28071654773521e-06,
+      "clip_ratio/low_mean": 3.060894187001395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6889658531436e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15847.0,
+      "completions/mean_length": 8068.5390625,
+      "completions/mean_terminated_length": 7363.8388671875,
+      "completions/min_length": 875.0,
+      "completions/min_terminated_length": 875.0,
+      "entropy": 0.8196670189499855,
+      "epoch": 0.46366145354185834,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021770994644612074,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 443992041.0,
+      "reward": 0.4453125,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999759197235107,
+      "sampling/importance_sampling_ratio/min": 0.0001795605494407937,
+      "sampling/sampling_logp_difference/max": 8.624998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019003838300704956,
+      "step": 504
+    },
+    {
+      "clip_ratio/high_max": 1.287241002501105e-05,
+      "clip_ratio/high_mean": 3.2181025062527624e-06,
+      "clip_ratio/low_mean": 4.5685408849749365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.89035115833758e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15168.0,
+      "completions/mean_length": 5209.140625,
+      "completions/mean_terminated_length": 5031.76220703125,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "entropy": 0.8851845487952232,
+      "epoch": 0.46458141674333026,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00788798462599516,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 444679675.0,
+      "reward": 0.4609375,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796748161316,
+      "sampling/importance_sampling_ratio/min": 0.00025673024356365204,
+      "sampling/sampling_logp_difference/max": 8.267484664916992,
+      "sampling/sampling_logp_difference/mean": 0.018808994442224503,
+      "step": 505
+    },
+    {
+      "clip_ratio/high_max": 2.294301202709903e-05,
+      "clip_ratio/high_mean": 6.590465602585027e-06,
+      "clip_ratio/low_mean": 5.944662643742049e-05,
+      "clip_ratio/low_min": 8.106994755507912e-06,
+      "clip_ratio/region_mean": 6.603709243790945e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16259.0,
+      "completions/mean_length": 7558.8984375,
+      "completions/mean_terminated_length": 7274.21728515625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.003449946641922,
+      "epoch": 0.46550137994480223,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004547314252704382,
+      "learning_rate": 1e-05,
+      "loss": 0.1586,
+      "num_tokens": 445668126.0,
+      "reward": 0.421875,
+      "reward_std": 0.42293959856033325,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999848484992981,
+      "sampling/importance_sampling_ratio/min": 0.00011622780584730208,
+      "sampling/sampling_logp_difference/max": 9.059958457946777,
+      "sampling/sampling_logp_difference/mean": 0.02099413052201271,
+      "step": 506
+    },
+    {
+      "clip_ratio/high_max": 2.1350435872591333e-05,
+      "clip_ratio/high_mean": 6.047981628398702e-06,
+      "clip_ratio/low_mean": 8.880347786544007e-05,
+      "clip_ratio/low_min": 9.06585455595632e-06,
+      "clip_ratio/region_mean": 9.485145938015194e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16137.0,
+      "completions/max_terminated_length": 16137.0,
+      "completions/mean_length": 6066.6015625,
+      "completions/mean_terminated_length": 6066.6015625,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "entropy": 0.8450648710131645,
+      "epoch": 0.46642134314627415,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004621773958206177,
+      "learning_rate": 1e-05,
+      "loss": 0.121,
+      "num_tokens": 446464587.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000154972076416,
+      "sampling/importance_sampling_ratio/min": 1.3950601896794979e-05,
+      "sampling/sampling_logp_difference/max": 11.179987907409668,
+      "sampling/sampling_logp_difference/mean": 0.018016980960965157,
+      "step": 507
+    },
+    {
+      "clip_ratio/high_max": 3.0534724828612525e-06,
+      "clip_ratio/high_mean": 7.633681207153131e-07,
+      "clip_ratio/low_mean": 2.149350007130124e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2256868305703392e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 6988.0234375,
+      "completions/mean_terminated_length": 6838.88134765625,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 1.0452716201543808,
+      "epoch": 0.46734130634774607,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004523546434938908,
+      "learning_rate": 1e-05,
+      "loss": 0.0396,
+      "num_tokens": 447381134.0,
+      "reward": 0.3515625,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999901056289673,
+      "sampling/importance_sampling_ratio/min": 0.016167031601071358,
+      "sampling/sampling_logp_difference/max": 4.124781131744385,
+      "sampling/sampling_logp_difference/mean": 0.021812722086906433,
+      "step": 508
+    },
+    {
+      "clip_ratio/high_max": 5.58759120394825e-06,
+      "clip_ratio/high_mean": 1.3968978009870625e-06,
+      "clip_ratio/low_mean": 3.684896307731833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.824586099199223e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12316.0,
+      "completions/max_terminated_length": 12316.0,
+      "completions/mean_length": 5948.5,
+      "completions/mean_terminated_length": 5948.5,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8241566568613052,
+      "epoch": 0.46826126954921804,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004002885892987251,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 448158014.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 0.0008566387114115059,
+      "sampling/sampling_logp_difference/max": 7.062494277954102,
+      "sampling/sampling_logp_difference/mean": 0.018487900495529175,
+      "step": 509
+    },
+    {
+      "clip_ratio/high_max": 1.0490723752809572e-05,
+      "clip_ratio/high_mean": 3.439610338773491e-06,
+      "clip_ratio/low_mean": 3.973086239739132e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3170473020381905e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16044.0,
+      "completions/mean_length": 7966.375,
+      "completions/mean_terminated_length": 7764.3525390625,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.8868448063731194,
+      "epoch": 0.46918123275068996,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019062751671299338,
+      "learning_rate": 1e-05,
+      "loss": 0.0787,
+      "num_tokens": 449197054.0,
+      "reward": 0.40625,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0001614262000657618,
+      "sampling/sampling_logp_difference/max": 8.731462478637695,
+      "sampling/sampling_logp_difference/mean": 0.020015282556414604,
+      "step": 510
+    },
+    {
+      "clip_ratio/high_max": 1.2195105682621943e-05,
+      "clip_ratio/high_mean": 3.0487764206554857e-06,
+      "clip_ratio/low_mean": 3.558348203114292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8632259474979946e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16124.0,
+      "completions/mean_length": 6520.0234375,
+      "completions/mean_terminated_length": 6442.3544921875,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9168323278427124,
+      "epoch": 0.47010119595216193,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00490277074277401,
+      "learning_rate": 1e-05,
+      "loss": 0.0547,
+      "num_tokens": 450050153.0,
+      "reward": 0.484375,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 4.4418397919798736e-06,
+      "sampling/sampling_logp_difference/max": 12.324441909790039,
+      "sampling/sampling_logp_difference/mean": 0.020178331062197685,
+      "step": 511
+    },
+    {
+      "clip_ratio/high_max": 7.95772848505294e-06,
+      "clip_ratio/high_mean": 1.989432121263235e-06,
+      "clip_ratio/low_mean": 3.363800146871654e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.562743381735345e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 6614.5625,
+      "completions/mean_terminated_length": 6217.4306640625,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.8635925352573395,
+      "epoch": 0.47102115915363385,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003792276605963707,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 450915281.0,
+      "reward": 0.5,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999154806137085,
+      "sampling/importance_sampling_ratio/min": 0.004489119164645672,
+      "sampling/sampling_logp_difference/max": 5.40609884262085,
+      "sampling/sampling_logp_difference/mean": 0.019233014434576035,
+      "step": 512
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 450915281,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-512/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-512/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# set to a truthy value to enable the extra diagnostic print() calls guarded by `if debug:` in this script
+debug = 0
+
+# load to cpu
+# (this is the `map_location` handed to the torch.load calls in this script)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters from the per-rank flat fp32 partitions and store them in ``state_dict``.
+
+    Args:
+        - ``state_dict``: mapping that receives one ``name -> tensor`` entry per parameter
+        - ``world_size``: number of partitions; also fixes the alignment used by the final sanity check
+        - ``fp32_flat_groups``: indexed as ``[rank][param group]``, each item a flat tensor
+        - ``zero_model_states``: parsed model states, only ``param_shapes`` of the first entry is read
+
+    Raises:
+        ValueError: if, for some param group, the aligned number of consumed elements differs from the
+        aligned number of available elements
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    #
+    # As implemented below:
+    # 1. for every param group, concatenate the partitions of all ranks into one flat vector
+    # 2. walk the {name: shape} mapping of that group in order and give each parameter the next
+    #    shape.numel() elements of the vector, viewed with its full shape
+    # 3. check that the elements consumed match the elements available, up to alignment padding
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # partition i of every rank, in the order the ranks appear in fp32_flat_groups
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups; only reported in the debug output, the loop below rebinds it per group
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset counts the elements of this group's vector that were already handed out
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape either offers numel() or is a plain sequence of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Build the consolidated zero2 state dict: buffers first, then frozen (optional) and trainable params."""
+    rank0_state = zero_model_states[0]
+    merged = OrderedDict()
+
+    # buffers are copied from the first model state only
+    merged.update(rank0_state.buffers)
+    if debug:
+        print(f"added {len(rank0_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(merged, zero_model_states)
+    _zero2_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: pair[0] becomes an alias of the tensor stored under pair[1]
+    for pair in rank0_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in merged:
+            merged[alias] = merged[source]
+    return merged
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """Return (ceil(numel / world_size), number of elements missing to reach a multiple of world_size)."""
+    leftover = unpartitioned_numel % world_size
+    padding_numel = world_size - leftover if leftover else 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """Rebuild each frozen parameter from the fragments held by every model state and add it to ``state_dict``."""
+    frozen_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_shapes is None or len(frozen_shapes) == 0:
+        return
+
+    if debug:
+        for rank in range(world_size):
+            rank_numel = sum(frag.numel() for frag in zero_model_states[rank].frozen_param_fragments.values())
+            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {rank_numel}')
+
+        wanted_params = len(frozen_shapes)
+        wanted_numel = sum(shape.numel() for shape in frozen_shapes.values())
+        rank0_numel = sum(frag.numel() for frag in zero_model_states[0].frozen_param_fragments.values())
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # join the fragments in model-state order and keep only the first unpartitioned_numel elements
+        fragments = tuple(one_state.frozen_param_fragments[name] for one_state in zero_model_states)
+        state_dict[name] = torch.cat(fragments, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    ``flat_groups`` is indexed ``[rank][group]``; ``flat_groups_offset[g]`` is the position at which group ``g``
+    starts inside one rank's concatenated groups (so the list has one more entry than there are groups).
+    ``offset`` and ``partitioned_numel`` locate this parameter's slice in that per-rank numbering.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns a new tensor of ``self.shape``: the slices of all ranks are concatenated in rank order and
+        whatever exceeds ``self.shape.numel()`` elements is dropped.
+        """
+        if self.shape.numel() == 0:
+            # an empty span may touch no group at all, so there is nothing to look up
+            return torch.empty(self.shape, dtype=self.dtype)
+
+        bounds = self.flat_groups_offset
+        begin_idx, end_idx = self.offset, self.offset + self.partitioned_numel
+        # groups overlapping [begin_idx, end_idx); the same offsets apply to every rank, so look them up once
+        group_ids = [g for g in range(len(bounds) - 1) if bounds[g] < end_idx and begin_idx < bounds[g + 1]]
+
+        pad_flat_param_chunks = []
+        for flat_groups_at_rank_i in self.flat_groups:
+            for group_id in group_ids:
+                # clamp both ends to the group: a span that continues in a later group starts at its position 0
+                start_offset = max(begin_idx, bounds[group_id]) - bounds[group_id]
+                end_offset = min(end_idx, bounds[group_id + 1]) - bounds[group_id]
+                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][start_offset:end_offset])
+
+        # collect weights from all ranks
+        return torch.cat(pad_flat_param_chunks, dim=0)[:self.shape.numel()].view(self.shape).contiguous()
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Add one lazy ``GatheredTensor`` per trainable parameter to ``state_dict``; ``fp32_flat_groups[rank]`` is
+    consumed as a sequence of flat tensors. Raises ``ValueError`` if the element counts do not add up.
+    """
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in zero_model_states[0].param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            # every rank holds a sequence of flat tensors, so the shapes are listed one by one
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={[tuple(group.shape) for group in fp32_flat_groups[i]]}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor: only the location is recorded, the data is gathered by .contiguous()
+        state_dict[name] = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        offset += partitioned_numel
+
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Build the consolidated zero3 state dict: buffers first, then frozen (optional) and trainable params."""
+    first_state = zero_model_states[0]
+    result = OrderedDict()
+
+    # buffers are copied from the first model state only
+    result.update(first_state.buffers)
+    if debug:
+        print(f"added {len(first_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(result, world_size, zero_model_states)
+    _zero3_merge_trainable_params(result, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: pair[0] becomes an alias of the entry stored under pair[1]
+    for pair in first_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in result:
+            result[alias] = result[source]
+    return result
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor.
+
+    Entries that are the very same object share one converted tensor. With ``return_empty_tensor`` set,
+    uninitialized tensors of matching shape and dtype are created instead of calling ``.contiguous()``.
+    """
+    converted = {}
+    first_name_by_id = {}  # id(entry) -> first name under which that object was converted
+    for name, entry in state_dict.items():
+        first_name = first_name_by_id.setdefault(id(entry), name)
+        if first_name != name:  # shared tensors
+            converted[name] = converted[first_name]
+        elif return_empty_tensor:
+            converted[name] = torch.empty(entry.shape, dtype=entry.dtype)
+        else:
+            converted[name] = entry.contiguous()
+    return converted
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, the tag is read from the 'latest' file, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: ``tag`` is not given and ``checkpoint_dir`` contains no 'latest' file
+        - ``FileNotFoundError``: the ``tag`` folder does not exist inside ``checkpoint_dir``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    # lazy mode hands the entries back untouched; otherwise they go through to_torch_tensor first
+    return state_dict if lazy_mode else to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) in ``output_dir`` (plus an
+    index json when the result is sharded) that can be used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files (created if missing)
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB; ``None`` writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check: each optional package is imported only when its feature is requested
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict (lazy: the entries are converted shard by shard below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: the split is planned on empty tensors of the same shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard: only the tensors of the shard being written are converted
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard (its entries are also dropped from state_dict)
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded: metadata plus the tensor name -> shard file map
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Extract the fp32 consolidated ``state_dict`` from the ZeRO 2 or 3 checkpoint
+    2. Put the provided model to cpu
+    3. Load the ``state_dict`` into it (with ``strict=False``)
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note that once this has run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    # strict=False: keys missing from, or unknown to, the model are not treated as errors
+    cpu_model.load_state_dict(fp32_state_dict, strict=False)
+    return cpu_model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")  # adjacent literals are joined as-is, hence the space
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    # the conversion functions read this module-level flag before printing their diagnostics
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/README.md b/dapo_milora_plus_20251201_131939/checkpoint-576/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-576/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-576/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-576/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/latest b/dapo_milora_plus_20251201_131939/checkpoint-576/latest
new file mode 100644
index 0000000000000000000000000000000000000000..1a40031386820b60f3a54acbdbae4813e4a986c7
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-576/latest
@@ -0,0 +1 @@
+global_step576
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-576/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-576/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the 2 potentially huge optimizer states as we only care about the fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note that once this has run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-64/latest b/dapo_milora_plus_20251201_131939/checkpoint-64/latest
new file mode 100644
index 0000000000000000000000000000000000000000..4a12e7f9029554e8e5ce68ebe3e97d0b4e734304
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-64/latest
@@ -0,0 +1 @@
+global_step64
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-64/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-64/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-64/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/README.md b/dapo_milora_plus_20251201_131939/checkpoint-640/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-640/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-640/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-640/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/latest b/dapo_milora_plus_20251201_131939/checkpoint-640/latest
new file mode 100644
index 0000000000000000000000000000000000000000..4a8906aefa3405aec9d51931707431ef44f4dace
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-640/latest
@@ -0,0 +1 @@
+global_step640
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-640/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-640/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-640/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f57c3a28876bdc73fe6f5aa88ea5d533caac1336
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-640/trainer_state.json
@@ -0,0 +1,19874 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5887764489420423,
+  "eval_steps": 500,
+  "global_step": 640,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 9.941184316630824e-06,
+      "clip_ratio/high_mean": 2.485296079157706e-06,
+      "clip_ratio/low_mean": 2.6134909091979353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8620205910101504e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 8426.1015625,
+      "completions/mean_terminated_length": 7965.72705078125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8188603445887566,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030983765609562397,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 162199765.0,
+      "reward": 0.25,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411106109619,
+      "sampling/importance_sampling_ratio/min": 0.0009119694004766643,
+      "sampling/sampling_logp_difference/max": 6.999904155731201,
+      "sampling/sampling_logp_difference/mean": 0.02070600539445877,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 2.612139087432297e-05,
+      "clip_ratio/high_mean": 6.530347718580742e-06,
+      "clip_ratio/low_mean": 3.7853451885894174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.438379949078808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15904.0,
+      "completions/mean_length": 7154.2109375,
+      "completions/mean_terminated_length": 6856.4755859375,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "entropy": 0.9913735538721085,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003430198412388563,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 163133232.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2120065689086914,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000275373458862,
+      "sampling/importance_sampling_ratio/min": 0.00042929715709760785,
+      "sampling/sampling_logp_difference/max": 7.753361225128174,
+      "sampling/sampling_logp_difference/mean": 0.02190260961651802,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 3.1841454983805306e-06,
+      "clip_ratio/high_mean": 7.960363745951327e-07,
+      "clip_ratio/low_mean": 3.384581600585079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4641852380445926e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 7693.1328125,
+      "completions/mean_terminated_length": 7412.7822265625,
+      "completions/min_length": 1077.0,
+      "completions/min_terminated_length": 1077.0,
+      "entropy": 0.9887127950787544,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002780586015433073,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 164134393.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999028444290161,
+      "sampling/importance_sampling_ratio/min": 3.559096626304381e-07,
+      "sampling/sampling_logp_difference/max": 14.848588943481445,
+      "sampling/sampling_logp_difference/mean": 0.021110571920871735,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 9.770586984814145e-06,
+      "clip_ratio/high_mean": 5.008155312680174e-06,
+      "clip_ratio/low_mean": 5.182203130971175e-05,
+      "clip_ratio/low_min": 1.5574546068819473e-05,
+      "clip_ratio/region_mean": 5.683018616764457e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 7072.1484375,
+      "completions/mean_terminated_length": 6771.76611328125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.861792616546154,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030156150460243225,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 165063412.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998926520347595,
+      "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06,
+      "sampling/sampling_logp_difference/max": 12.999247550964355,
+      "sampling/sampling_logp_difference/mean": 0.019325289875268936,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 2.2510209873871645e-05,
+      "clip_ratio/high_mean": 6.455301331698138e-06,
+      "clip_ratio/low_mean": 6.156819108582567e-05,
+      "clip_ratio/low_min": 5.763157332694391e-06,
+      "clip_ratio/region_mean": 6.802349253121065e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15062.0,
+      "completions/mean_length": 7353.421875,
+      "completions/mean_terminated_length": 7062.11279296875,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "entropy": 0.8961873054504395,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034921523183584213,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 166024306.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.0005124400486238301,
+      "sampling/sampling_logp_difference/max": 7.576326847076416,
+      "sampling/sampling_logp_difference/mean": 0.019593238830566406,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.3040991007073899e-05,
+      "clip_ratio/high_mean": 4.292725350296678e-06,
+      "clip_ratio/low_mean": 5.347559840629401e-05,
+      "clip_ratio/low_min": 6.613406640099129e-06,
+      "clip_ratio/region_mean": 5.776832381343411e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15604.0,
+      "completions/mean_length": 7348.03125,
+      "completions/mean_terminated_length": 6903.63916015625,
+      "completions/min_length": 1619.0,
+      "completions/min_terminated_length": 1619.0,
+      "entropy": 0.824029266834259,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027784397825598717,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 166984982.0,
+      "reward": 0.40625,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 0.0010020677000284195,
+      "sampling/sampling_logp_difference/max": 6.905689716339111,
+      "sampling/sampling_logp_difference/mean": 0.01857386901974678,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 3.330808067403268e-05,
+      "clip_ratio/high_mean": 1.0969530649163062e-05,
+      "clip_ratio/low_mean": 3.2080681648949394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3050211388617754e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16358.0,
+      "completions/mean_length": 7290.4765625,
+      "completions/mean_terminated_length": 6920.82080078125,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8884479627013206,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004110465291887522,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 167936971.0,
+      "reward": 0.4375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06,
+      "sampling/sampling_logp_difference/max": 13.219663619995117,
+      "sampling/sampling_logp_difference/mean": 0.019696572795510292,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 9.77357763076725e-06,
+      "clip_ratio/high_mean": 2.4433944076918124e-06,
+      "clip_ratio/low_mean": 3.466498992565903e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.710838473125477e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 7803.625,
+      "completions/mean_terminated_length": 6833.66943359375,
+      "completions/min_length": 929.0,
+      "completions/min_terminated_length": 929.0,
+      "entropy": 0.8326860442757607,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002410614863038063,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "num_tokens": 168955683.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0008801451185718179,
+      "sampling/sampling_logp_difference/max": 7.035423755645752,
+      "sampling/sampling_logp_difference/mean": 0.018545793369412422,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.4602125929741305e-05,
+      "clip_ratio/high_mean": 3.6505314824353263e-06,
+      "clip_ratio/low_mean": 3.4781527119776e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8432058772741584e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6804.34375,
+      "completions/mean_terminated_length": 6495.322265625,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.9669496119022369,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034376555122435093,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 169845823.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31534504890441895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 1.767780588579626e-08,
+      "sampling/sampling_logp_difference/max": 17.850955963134766,
+      "sampling/sampling_logp_difference/mean": 0.020515555515885353,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 1.5814722473805887e-05,
+      "clip_ratio/high_mean": 3.953680618451472e-06,
+      "clip_ratio/low_mean": 3.574208744794305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9695768407455034e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 6827.9609375,
+      "completions/mean_terminated_length": 6105.23583984375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 0.8833946585655212,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026675171684473753,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 170738210.0,
+      "reward": 0.421875,
+      "reward_std": 0.2698654532432556,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000019907951355,
+      "sampling/importance_sampling_ratio/min": 0.002906275913119316,
+      "sampling/sampling_logp_difference/max": 5.840882778167725,
+      "sampling/sampling_logp_difference/mean": 0.019948139786720276,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.6623121837255894e-05,
+      "clip_ratio/high_mean": 4.1557804593139736e-06,
+      "clip_ratio/low_mean": 6.462372630267055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.877950727357529e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15725.0,
+      "completions/mean_length": 7377.984375,
+      "completions/mean_terminated_length": 7307.07080078125,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8881714344024658,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0039620306342840195,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 171705152.0,
+      "reward": 0.3359375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05,
+      "sampling/sampling_logp_difference/max": 10.614632606506348,
+      "sampling/sampling_logp_difference/mean": 0.01964445412158966,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 9.639111340220552e-06,
+      "clip_ratio/high_mean": 2.409777835055138e-06,
+      "clip_ratio/low_mean": 2.775239624952519e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0162174198267167e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15265.0,
+      "completions/mean_length": 6051.8828125,
+      "completions/mean_terminated_length": 5543.74560546875,
+      "completions/min_length": 819.0,
+      "completions/min_terminated_length": 819.0,
+      "entropy": 0.8851477280259132,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0040458571165800095,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 172501881.0,
+      "reward": 0.4296875,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999410510063171,
+      "sampling/importance_sampling_ratio/min": 0.0021976607386022806,
+      "sampling/sampling_logp_difference/max": 6.120361804962158,
+      "sampling/sampling_logp_difference/mean": 0.01957303285598755,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 9.72708312474424e-06,
+      "clip_ratio/high_mean": 3.529455852913088e-06,
+      "clip_ratio/low_mean": 5.158422732165491e-05,
+      "clip_ratio/low_min": 1.1939961495954776e-05,
+      "clip_ratio/region_mean": 5.5113683174567996e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7830.171875,
+      "completions/mean_terminated_length": 7409.4912109375,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.9070459827780724,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005941574461758137,
+      "learning_rate": 1e-05,
+      "loss": 0.0427,
+      "num_tokens": 173522391.0,
+      "reward": 0.34375,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000017881393433,
+      "sampling/importance_sampling_ratio/min": 0.00011712420382536948,
+      "sampling/sampling_logp_difference/max": 9.052275657653809,
+      "sampling/sampling_logp_difference/mean": 0.021295130252838135,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 5.5543214330100454e-06,
+      "clip_ratio/high_mean": 1.3885803582525114e-06,
+      "clip_ratio/low_mean": 1.718775109793569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8576331683561875e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15443.0,
+      "completions/mean_length": 7520.6796875,
+      "completions/mean_terminated_length": 6769.55078125,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.8843575045466423,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025851845275610685,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 174504534.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 0.00039556476986035705,
+      "sampling/sampling_logp_difference/max": 7.835196018218994,
+      "sampling/sampling_logp_difference/mean": 0.02016005665063858,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 1.0145481155632297e-05,
+      "clip_ratio/high_mean": 2.536370288908074e-06,
+      "clip_ratio/low_mean": 3.617897255026037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.871534295285528e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 7382.1875,
+      "completions/mean_terminated_length": 6861.42138671875,
+      "completions/min_length": 934.0,
+      "completions/min_terminated_length": 934.0,
+      "entropy": 0.916313610970974,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004170550964772701,
+      "learning_rate": 1e-05,
+      "loss": 0.047,
+      "num_tokens": 175472574.0,
+      "reward": 0.46875,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05,
+      "sampling/sampling_logp_difference/max": 10.481352806091309,
+      "sampling/sampling_logp_difference/mean": 0.020749717950820923,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.83663013963087e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.83663013963087e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 6122.453125,
+      "completions/mean_terminated_length": 6041.6533203125,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.8984386026859283,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 176275568.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 7.88934721640544e-06,
+      "sampling/sampling_logp_difference/max": 11.74999713897705,
+      "sampling/sampling_logp_difference/mean": 0.020278753712773323,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 1.4535152331518475e-05,
+      "clip_ratio/high_mean": 3.6337880828796187e-06,
+      "clip_ratio/low_mean": 4.3961883989140915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7595671958333696e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15547.0,
+      "completions/mean_length": 4983.2890625,
+      "completions/mean_terminated_length": 4709.67236328125,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.825260303914547,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004848882555961609,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 176932549.0,
+      "reward": 0.6484375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616146087646,
+      "sampling/importance_sampling_ratio/min": 1.626804078114219e-05,
+      "sampling/sampling_logp_difference/max": 11.026308059692383,
+      "sampling/sampling_logp_difference/mean": 0.017959970980882645,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.1141860795760294e-05,
+      "clip_ratio/high_mean": 2.7854651989400736e-06,
+      "clip_ratio/low_mean": 4.2418692146384274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5204157913758536e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 5766.5234375,
+      "completions/mean_terminated_length": 5511.7041015625,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.9016259610652924,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004749474115669727,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 177691752.0,
+      "reward": 0.5,
+      "reward_std": 0.2738044261932373,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000141859054565,
+      "sampling/importance_sampling_ratio/min": 8.927558155846782e-06,
+      "sampling/sampling_logp_difference/max": 11.626367568969727,
+      "sampling/sampling_logp_difference/mean": 0.019118282943964005,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 5.5243735914700665e-06,
+      "clip_ratio/high_mean": 2.1587275114143267e-06,
+      "clip_ratio/low_mean": 4.609663824339805e-05,
+      "clip_ratio/low_min": 3.983555870945565e-06,
+      "clip_ratio/region_mean": 4.8255366664307076e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15696.0,
+      "completions/mean_length": 6993.671875,
+      "completions/mean_terminated_length": 6768.30419921875,
+      "completions/min_length": 889.0,
+      "completions/min_terminated_length": 889.0,
+      "entropy": 0.9074988812208176,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004418120253831148,
+      "learning_rate": 1e-05,
+      "loss": 0.1135,
+      "num_tokens": 178603454.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000037670135498,
+      "sampling/importance_sampling_ratio/min": 0.0018135923892259598,
+      "sampling/sampling_logp_difference/max": 6.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01957814022898674,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 5.126943051436683e-06,
+      "clip_ratio/high_mean": 1.2817357628591708e-06,
+      "clip_ratio/low_mean": 2.7488794444252562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.877053032079857e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 7445.1328125,
+      "completions/mean_terminated_length": 6849.20849609375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9255013465881348,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00237120408564806,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 179577063.0,
+      "reward": 0.40625,
+      "reward_std": 0.21040897071361542,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725818634033,
+      "sampling/importance_sampling_ratio/min": 9.651589061832055e-05,
+      "sampling/sampling_logp_difference/max": 9.245802879333496,
+      "sampling/sampling_logp_difference/mean": 0.02165937051177025,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.8956294752570102e-05,
+      "clip_ratio/high_mean": 4.7390736881425255e-06,
+      "clip_ratio/low_mean": 2.6486316301088664e-05,
+      "clip_ratio/low_min": 3.516273409331916e-06,
+      "clip_ratio/region_mean": 3.122539010291803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6120.5546875,
+      "completions/mean_terminated_length": 5703.34130859375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8181199952960014,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004715202376246452,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 180380422.0,
+      "reward": 0.5,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999874472618103,
+      "sampling/importance_sampling_ratio/min": 0.004350374918431044,
+      "sampling/sampling_logp_difference/max": 5.437493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018377620726823807,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 5.594843969447538e-06,
+      "clip_ratio/high_mean": 2.376495558564784e-06,
+      "clip_ratio/low_mean": 3.4097628713425365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6474124044616474e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 6351.203125,
+      "completions/mean_terminated_length": 5857.78662109375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.8798654451966286,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003063712501898408,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 181212776.0,
+      "reward": 0.453125,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999946355819702,
+      "sampling/importance_sampling_ratio/min": 7.891544555604924e-06,
+      "sampling/sampling_logp_difference/max": 11.74971866607666,
+      "sampling/sampling_logp_difference/mean": 0.019523698836565018,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.544438988001275e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.544438988001275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14180.0,
+      "completions/mean_length": 6330.046875,
+      "completions/mean_terminated_length": 6170.46044921875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.8319354206323624,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033194730058312416,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 182041910.0,
+      "reward": 0.453125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998994469642639,
+      "sampling/importance_sampling_ratio/min": 0.00010535263572819531,
+      "sampling/sampling_logp_difference/max": 9.158197402954102,
+      "sampling/sampling_logp_difference/mean": 0.018981872126460075,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7156292415165808e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7156292415165808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15982.0,
+      "completions/mean_length": 6665.2890625,
+      "completions/mean_terminated_length": 6351.7822265625,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9336326420307159,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004492956213653088,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 182914843.0,
+      "reward": 0.3828125,
+      "reward_std": 0.14807432889938354,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 0.011399568989872932,
+      "sampling/sampling_logp_difference/max": 4.474179744720459,
+      "sampling/sampling_logp_difference/mean": 0.02088768407702446,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.2495465802639956e-05,
+      "clip_ratio/high_mean": 9.084843100026774e-06,
+      "clip_ratio/low_mean": 5.4809036328151706e-05,
+      "clip_ratio/low_min": 8.953898031904828e-06,
+      "clip_ratio/region_mean": 6.389387954186532e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16064.0,
+      "completions/mean_length": 5393.9140625,
+      "completions/mean_terminated_length": 5039.39501953125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.7864786610007286,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003816079581156373,
+      "learning_rate": 1e-05,
+      "loss": -0.004,
+      "num_tokens": 183628152.0,
+      "reward": 0.546875,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998779892921448,
+      "sampling/importance_sampling_ratio/min": 0.003246711567044258,
+      "sampling/sampling_logp_difference/max": 5.730112552642822,
+      "sampling/sampling_logp_difference/mean": 0.018448319286108017,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 8.638648068881594e-06,
+      "clip_ratio/high_mean": 2.1596620172203984e-06,
+      "clip_ratio/low_mean": 1.6896704778446292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9056366909353528e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 7161.5,
+      "completions/mean_terminated_length": 7015.111328125,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.915394201874733,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003666195785626769,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 184562352.0,
+      "reward": 0.3671875,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00025550799909979105,
+      "sampling/sampling_logp_difference/max": 8.272256851196289,
+      "sampling/sampling_logp_difference/mean": 0.019755780696868896,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 6.424931598303374e-06,
+      "clip_ratio/high_mean": 1.6062328995758435e-06,
+      "clip_ratio/low_mean": 2.49038239417132e-05,
+      "clip_ratio/low_min": 4.00025601265952e-06,
+      "clip_ratio/region_mean": 2.651005689813246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15408.0,
+      "completions/mean_length": 7957.671875,
+      "completions/mean_terminated_length": 7685.8544921875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "entropy": 1.1176252663135529,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025940234772861004,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 185606670.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999893844127655,
+      "sampling/importance_sampling_ratio/min": 0.0007622809498570859,
+      "sampling/sampling_logp_difference/max": 7.179195404052734,
+      "sampling/sampling_logp_difference/mean": 0.02338646724820137,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 1.9903963220713194e-05,
+      "clip_ratio/high_mean": 5.829163114867697e-06,
+      "clip_ratio/low_mean": 4.4742550926457625e-05,
+      "clip_ratio/low_min": 3.5803282116830815e-06,
+      "clip_ratio/region_mean": 5.057171370026481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 7060.6640625,
+      "completions/mean_terminated_length": 6759.9111328125,
+      "completions/min_length": 1460.0,
+      "completions/min_terminated_length": 1460.0,
+      "entropy": 0.9148540124297142,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004315398633480072,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 186526883.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0004585353017318994,
+      "sampling/sampling_logp_difference/max": 7.687473297119141,
+      "sampling/sampling_logp_difference/mean": 0.01967843994498253,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 1.147099328591139e-05,
+      "clip_ratio/high_mean": 2.8677483214778476e-06,
+      "clip_ratio/low_mean": 2.8967988555450574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1835736763241584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15596.0,
+      "completions/mean_length": 6649.6640625,
+      "completions/mean_terminated_length": 6416.04052734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9298559054732323,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030786178540438414,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 187397536.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000005841255188,
+      "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07,
+      "sampling/sampling_logp_difference/max": 14.929608345031738,
+      "sampling/sampling_logp_difference/mean": 0.020215414464473724,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 2.2768570943298982e-05,
+      "clip_ratio/high_mean": 5.692142735824746e-06,
+      "clip_ratio/low_mean": 3.249637484259438e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8188517464732286e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 8292.015625,
+      "completions/mean_terminated_length": 7823.8837890625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.8232023045420647,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002438523108139634,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 188477778.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.005636279005557299,
+      "sampling/sampling_logp_difference/max": 5.178531169891357,
+      "sampling/sampling_logp_difference/mean": 0.018984414637088776,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 2.0840709566982696e-05,
+      "clip_ratio/high_mean": 6.135253556749376e-06,
+      "clip_ratio/low_mean": 2.255633432923787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.869158777230041e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15991.0,
+      "completions/mean_length": 7600.9765625,
+      "completions/mean_terminated_length": 6936.71484375,
+      "completions/min_length": 995.0,
+      "completions/min_terminated_length": 995.0,
+      "entropy": 0.8689917623996735,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004773247055709362,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 189470655.0,
+      "reward": 0.40625,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.001327168894931674,
+      "sampling/sampling_logp_difference/max": 6.624707221984863,
+      "sampling/sampling_logp_difference/mean": 0.018666012212634087,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 9.837458947004052e-06,
+      "clip_ratio/high_mean": 2.459364736751013e-06,
+      "clip_ratio/low_mean": 6.463955219260242e-05,
+      "clip_ratio/low_min": 1.0895145351241808e-05,
+      "clip_ratio/region_mean": 6.70989177251613e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16215.0,
+      "completions/mean_length": 7600.34375,
+      "completions/mean_terminated_length": 6855.96630859375,
+      "completions/min_length": 1335.0,
+      "completions/min_terminated_length": 1335.0,
+      "entropy": 0.7636929750442505,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004298723768442869,
+      "learning_rate": 1e-05,
+      "loss": 0.145,
+      "num_tokens": 190462227.0,
+      "reward": 0.515625,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999310374259949,
+      "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05,
+      "sampling/sampling_logp_difference/max": 9.996363639831543,
+      "sampling/sampling_logp_difference/mean": 0.018035393208265305,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 1.4060602325116633e-05,
+      "clip_ratio/high_mean": 3.5151505812791584e-06,
+      "clip_ratio/low_mean": 2.6516039497437305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.003119024924672e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15151.0,
+      "completions/mean_length": 6512.0,
+      "completions/mean_terminated_length": 6434.267578125,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.9043584689497948,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006741553544998169,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 191312483.0,
+      "reward": 0.484375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 1.778468504198827e-05,
+      "sampling/sampling_logp_difference/max": 10.937172889709473,
+      "sampling/sampling_logp_difference/mean": 0.020878732204437256,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 1.7356085209030425e-05,
+      "clip_ratio/high_mean": 4.339021302257606e-06,
+      "clip_ratio/low_mean": 2.8831826739406097e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.317084781429003e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7178.6875,
+      "completions/mean_terminated_length": 6565.00048828125,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.8899475410580635,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00281486171297729,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 192251235.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2240736484527588,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999714493751526,
+      "sampling/importance_sampling_ratio/min": 9.012543159769848e-05,
+      "sampling/sampling_logp_difference/max": 9.314308166503906,
+      "sampling/sampling_logp_difference/mean": 0.020196784287691116,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.5558084214717383e-05,
+      "clip_ratio/high_mean": 3.889521053679346e-06,
+      "clip_ratio/low_mean": 3.0248688972278615e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.413820991227112e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15501.0,
+      "completions/max_terminated_length": 15501.0,
+      "completions/mean_length": 6602.5625,
+      "completions/mean_terminated_length": 6602.5625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.9266818463802338,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005070593673735857,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193116763.0,
+      "reward": 0.53125,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999746680259705,
+      "sampling/importance_sampling_ratio/min": 2.726537559283315e-06,
+      "sampling/sampling_logp_difference/max": 12.812478065490723,
+      "sampling/sampling_logp_difference/mean": 0.020026464015245438,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.188727416476468e-06,
+      "clip_ratio/high_mean": 1.047181854119117e-06,
+      "clip_ratio/low_mean": 2.959152834591805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.063871008635033e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6818.8828125,
+      "completions/mean_terminated_length": 6430.056640625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.874519519507885,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006362155079841614,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 194007868.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009298324585,
+      "sampling/importance_sampling_ratio/min": 0.0005216691642999649,
+      "sampling/sampling_logp_difference/max": 7.55847692489624,
+      "sampling/sampling_logp_difference/mean": 0.01943325623869896,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 9.645911177358357e-06,
+      "clip_ratio/high_mean": 2.4114777943395893e-06,
+      "clip_ratio/low_mean": 6.821557258263056e-05,
+      "clip_ratio/low_min": 1.7265090718865395e-05,
+      "clip_ratio/region_mean": 7.062705049065698e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14536.0,
+      "completions/mean_length": 5515.625,
+      "completions/mean_terminated_length": 5343.111328125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0683523043990135,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003797185141593218,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "num_tokens": 194735980.0,
+      "reward": 0.421875,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 1.137102216830499e-07,
+      "sampling/sampling_logp_difference/max": 15.989612579345703,
+      "sampling/sampling_logp_difference/mean": 0.02120930328965187,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.1971412252241862e-05,
+      "clip_ratio/high_mean": 5.4928530630604655e-06,
+      "clip_ratio/low_mean": 4.9151800567415194e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4644653801005916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 5853.546875,
+      "completions/mean_terminated_length": 5770.6298828125,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.7975900694727898,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004124365746974945,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 195504882.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000672340393066,
+      "sampling/importance_sampling_ratio/min": 0.0032877910416573286,
+      "sampling/sampling_logp_difference/max": 5.717539310455322,
+      "sampling/sampling_logp_difference/mean": 0.017819223925471306,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 7.066538728395244e-06,
+      "clip_ratio/high_mean": 2.843255515472265e-06,
+      "clip_ratio/low_mean": 5.1467116236381116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.431037175185338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6686.25,
+      "completions/mean_terminated_length": 6532.31787109375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.9018580466508865,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024995009880512953,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 196379306.0,
+      "reward": 0.421875,
+      "reward_std": 0.35824593901634216,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999300837516785,
+      "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05,
+      "sampling/sampling_logp_difference/max": 10.818918228149414,
+      "sampling/sampling_logp_difference/mean": 0.018989525735378265,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 6.652828687947476e-06,
+      "clip_ratio/high_mean": 2.5722979444253724e-06,
+      "clip_ratio/low_mean": 3.699686294567073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95691608900961e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16347.0,
+      "completions/mean_length": 7487.3359375,
+      "completions/mean_terminated_length": 7200.3466796875,
+      "completions/min_length": 1222.0,
+      "completions/min_terminated_length": 1222.0,
+      "entropy": 0.9890001565217972,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004295211285352707,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 197357397.0,
+      "reward": 0.40625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0006548459641635418,
+      "sampling/sampling_logp_difference/max": 7.33111047744751,
+      "sampling/sampling_logp_difference/mean": 0.02209121733903885,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 6.0850939007650595e-06,
+      "clip_ratio/high_mean": 1.5212734751912649e-06,
+      "clip_ratio/low_mean": 2.9443070673096372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0964344205131056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7233.484375,
+      "completions/mean_terminated_length": 6938.30615234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.9683803990483284,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003119673579931259,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 198303795.0,
+      "reward": 0.328125,
+      "reward_std": 0.23014704883098602,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000243186950684,
+      "sampling/importance_sampling_ratio/min": 0.020358745008707047,
+      "sampling/sampling_logp_difference/max": 3.89424467086792,
+      "sampling/sampling_logp_difference/mean": 0.021085180342197418,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.963812095113099e-06,
+      "clip_ratio/high_mean": 1.9909530237782747e-06,
+      "clip_ratio/low_mean": 4.031422963635123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.23051826601295e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15733.0,
+      "completions/mean_length": 6457.78125,
+      "completions/mean_terminated_length": 6300.22265625,
+      "completions/min_length": 850.0,
+      "completions/min_terminated_length": 850.0,
+      "entropy": 0.8881053999066353,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033790848683565855,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 199154735.0,
+      "reward": 0.3828125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998799562454224,
+      "sampling/importance_sampling_ratio/min": 2.872048128210736e-07,
+      "sampling/sampling_logp_difference/max": 15.063070297241211,
+      "sampling/sampling_logp_difference/mean": 0.01950821653008461,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 9.059622016138746e-06,
+      "clip_ratio/high_mean": 3.3430123380639998e-06,
+      "clip_ratio/low_mean": 2.2856192117615137e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6199204512522556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 7904.40625,
+      "completions/mean_terminated_length": 7769.81005859375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9881557524204254,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021492803934961557,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 200185643.0,
+      "reward": 0.359375,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001094341278076,
+      "sampling/importance_sampling_ratio/min": 0.001458622980862856,
+      "sampling/sampling_logp_difference/max": 6.530262470245361,
+      "sampling/sampling_logp_difference/mean": 0.021201875060796738,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 6.9962839006620925e-06,
+      "clip_ratio/high_mean": 1.7490709751655231e-06,
+      "clip_ratio/low_mean": 3.018811844412994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193718976035598e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 7414.4921875,
+      "completions/mean_terminated_length": 7414.4921875,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "entropy": 0.9571134969592094,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037221095990389585,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 201153114.0,
+      "reward": 0.4375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999958872795105,
+      "sampling/importance_sampling_ratio/min": 0.0009130563121289015,
+      "sampling/sampling_logp_difference/max": 6.99871301651001,
+      "sampling/sampling_logp_difference/mean": 0.021356744691729546,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 1.1248092050664127e-05,
+      "clip_ratio/high_mean": 2.8120230126660317e-06,
+      "clip_ratio/low_mean": 5.4354991334548686e-05,
+      "clip_ratio/low_min": 6.868132004456129e-06,
+      "clip_ratio/region_mean": 5.716701480196207e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15835.0,
+      "completions/max_terminated_length": 15835.0,
+      "completions/mean_length": 5955.953125,
+      "completions/mean_terminated_length": 5955.953125,
+      "completions/min_length": 1394.0,
+      "completions/min_terminated_length": 1394.0,
+      "entropy": 0.730999618768692,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006285305600613356,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 201933044.0,
+      "reward": 0.59375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.007535050623118877,
+      "sampling/sampling_logp_difference/max": 4.888189792633057,
+      "sampling/sampling_logp_difference/mean": 0.016975615173578262,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 7.226686648209579e-06,
+      "clip_ratio/high_mean": 3.094216481258627e-06,
+      "clip_ratio/low_mean": 4.66828214484849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.977703792974353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6923.3515625,
+      "completions/mean_terminated_length": 6458.0732421875,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9938417226076126,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005667983554303646,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 202837281.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05,
+      "sampling/sampling_logp_difference/max": 10.402952194213867,
+      "sampling/sampling_logp_difference/mean": 0.022059854120016098,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 5.2318769121484365e-06,
+      "clip_ratio/high_mean": 1.3079692280371091e-06,
+      "clip_ratio/low_mean": 4.239228087499214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3700250216716086e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14726.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 5930.9296875,
+      "completions/mean_terminated_length": 5930.9296875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.8100385963916779,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004052883945405483,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 203614448.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999989926815033,
+      "sampling/importance_sampling_ratio/min": 0.00015170808183029294,
+      "sampling/sampling_logp_difference/max": 8.79355239868164,
+      "sampling/sampling_logp_difference/mean": 0.018519222736358643,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 4.905230980511988e-06,
+      "clip_ratio/high_mean": 1.226307745127997e-06,
+      "clip_ratio/low_mean": 5.500513248080097e-05,
+      "clip_ratio/low_min": 7.924934834591113e-06,
+      "clip_ratio/region_mean": 5.6231440112242126e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14996.0,
+      "completions/mean_length": 6911.1015625,
+      "completions/mean_terminated_length": 6108.3134765625,
+      "completions/min_length": 862.0,
+      "completions/min_terminated_length": 862.0,
+      "entropy": 0.9260227829217911,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004494607914239168,
+      "learning_rate": 1e-05,
+      "loss": 0.0269,
+      "num_tokens": 204518261.0,
+      "reward": 0.4140625,
+      "reward_std": 0.34033796191215515,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998886585235596,
+      "sampling/importance_sampling_ratio/min": 0.0015266009140759706,
+      "sampling/sampling_logp_difference/max": 6.484711647033691,
+      "sampling/sampling_logp_difference/mean": 0.020527629181742668,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 8.293764039990492e-06,
+      "clip_ratio/high_mean": 2.073441009997623e-06,
+      "clip_ratio/low_mean": 4.75325257411896e-05,
+      "clip_ratio/low_min": 3.599504680096288e-06,
+      "clip_ratio/region_mean": 4.960596663750039e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14637.0,
+      "completions/mean_length": 6972.921875,
+      "completions/mean_terminated_length": 6823.5400390625,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 1.0095533654093742,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029451537411659956,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 205433843.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05,
+      "sampling/sampling_logp_difference/max": 10.53177547454834,
+      "sampling/sampling_logp_difference/mean": 0.02013089321553707,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 4.163383164268453e-05,
+      "clip_ratio/high_mean": 1.382379150527413e-05,
+      "clip_ratio/low_mean": 3.86000854177837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2423876240936806e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 6706.6640625,
+      "completions/mean_terminated_length": 6313.2763671875,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 0.8647518903017044,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003371767932549119,
+      "learning_rate": 1e-05,
+      "loss": 0.073,
+      "num_tokens": 206310296.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 2.948181463580113e-05,
+      "sampling/sampling_logp_difference/max": 10.431736946105957,
+      "sampling/sampling_logp_difference/mean": 0.019770190119743347,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4946740381892596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4946740381892596e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 6882.609375,
+      "completions/mean_terminated_length": 6415.32763671875,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "entropy": 1.013342760503292,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016336971893906593,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 207210974.0,
+      "reward": 0.359375,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999210834503174,
+      "sampling/importance_sampling_ratio/min": 0.0013267879839986563,
+      "sampling/sampling_logp_difference/max": 6.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.02139991894364357,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.4866403944324702e-05,
+      "clip_ratio/high_mean": 3.7166009860811755e-06,
+      "clip_ratio/low_mean": 3.938925010515959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.310585177336179e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15203.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 6195.7421875,
+      "completions/mean_terminated_length": 6195.7421875,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.8448907434940338,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005036406684666872,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208021893.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955892562866,
+      "sampling/importance_sampling_ratio/min": 0.0040348549373447895,
+      "sampling/sampling_logp_difference/max": 5.512784957885742,
+      "sampling/sampling_logp_difference/mean": 0.018679853528738022,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.1244883353356272e-05,
+      "clip_ratio/high_mean": 2.811220838339068e-06,
+      "clip_ratio/low_mean": 3.422392001084518e-05,
+      "clip_ratio/low_min": 6.451612989621935e-06,
+      "clip_ratio/region_mean": 3.703514119024476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6829.609375,
+      "completions/mean_terminated_length": 6521.40283203125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.8679579794406891,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029643685556948185,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 208912059.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00038063788088038564,
+      "sampling/sampling_logp_difference/max": 7.873661994934082,
+      "sampling/sampling_logp_difference/mean": 0.018488366156816483,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 2.2700600311509334e-05,
+      "clip_ratio/high_mean": 5.675150077877333e-06,
+      "clip_ratio/low_mean": 3.138338854569156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.705853873725573e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14503.0,
+      "completions/max_terminated_length": 14503.0,
+      "completions/mean_length": 5444.4453125,
+      "completions/mean_terminated_length": 5444.4453125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0460086688399315,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035942886024713516,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "num_tokens": 209627804.0,
+      "reward": 0.484375,
+      "reward_std": 0.338498055934906,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997478723526,
+      "sampling/importance_sampling_ratio/min": 0.03179635480046272,
+      "sampling/sampling_logp_difference/max": 3.4484035968780518,
+      "sampling/sampling_logp_difference/mean": 0.020146891474723816,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.477029400120955e-05,
+      "clip_ratio/high_mean": 4.552578502625693e-06,
+      "clip_ratio/low_mean": 5.265122354103369e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.720380158891203e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16244.0,
+      "completions/mean_length": 7657.390625,
+      "completions/mean_terminated_length": 7152.544921875,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 0.9528728649020195,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0044983453117311,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 210630150.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000007152557373,
+      "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05,
+      "sampling/sampling_logp_difference/max": 10.158285140991211,
+      "sampling/sampling_logp_difference/mean": 0.02131088823080063,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 8.607642712377128e-06,
+      "clip_ratio/high_mean": 2.151910678094282e-06,
+      "clip_ratio/low_mean": 2.2759413695894182e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.491132454451872e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7574.3515625,
+      "completions/mean_terminated_length": 7504.984375,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 1.0009776800870895,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006095650140196085,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 211620355.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 0.0013946897815912962,
+      "sampling/sampling_logp_difference/max": 6.575083255767822,
+      "sampling/sampling_logp_difference/mean": 0.021727774292230606,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.764823082339717e-05,
+      "clip_ratio/high_mean": 5.141430960975413e-06,
+      "clip_ratio/low_mean": 5.936152001595474e-05,
+      "clip_ratio/low_min": 9.155588486464694e-06,
+      "clip_ratio/region_mean": 6.450295177273802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14915.0,
+      "completions/mean_length": 7919.6875,
+      "completions/mean_terminated_length": 7716.54443359375,
+      "completions/min_length": 1517.0,
+      "completions/min_terminated_length": 1517.0,
+      "entropy": 1.0405654236674309,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037038614973425865,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 212654747.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381899833679,
+      "sampling/importance_sampling_ratio/min": 0.0057550109922885895,
+      "sampling/sampling_logp_difference/max": 5.157684326171875,
+      "sampling/sampling_logp_difference/mean": 0.022051017731428146,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.265254240934155e-05,
+      "clip_ratio/high_mean": 3.1631356023353874e-06,
+      "clip_ratio/low_mean": 4.716233138424286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.032546687289141e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 8613.4765625,
+      "completions/mean_terminated_length": 7735.0693359375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.890489287674427,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325607368722558,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 213774584.0,
+      "reward": 0.40625,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000060796737671,
+      "sampling/importance_sampling_ratio/min": 1.670176425250247e-05,
+      "sampling/sampling_logp_difference/max": 10.999996185302734,
+      "sampling/sampling_logp_difference/mean": 0.020002499222755432,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.6404605503339553e-05,
+      "clip_ratio/high_mean": 4.101151375834888e-06,
+      "clip_ratio/low_mean": 3.880500707964529e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2906158682853857e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7324.8984375,
+      "completions/mean_terminated_length": 6473.1884765625,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 0.761004202067852,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038265211042016745,
+      "learning_rate": 1e-05,
+      "loss": 0.0717,
+      "num_tokens": 214728371.0,
+      "reward": 0.515625,
+      "reward_std": 0.32719239592552185,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000168085098267,
+      "sampling/importance_sampling_ratio/min": 0.0003049026126973331,
+      "sampling/sampling_logp_difference/max": 8.095518112182617,
+      "sampling/sampling_logp_difference/mean": 0.018367979675531387,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 5.624549885396846e-06,
+      "clip_ratio/high_mean": 1.4061374713492114e-06,
+      "clip_ratio/low_mean": 3.6433707123251224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7839844594600436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14167.0,
+      "completions/max_terminated_length": 14167.0,
+      "completions/mean_length": 6422.0859375,
+      "completions/mean_terminated_length": 6422.0859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.9946094751358032,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002729539293795824,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 215570806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.026308411732316017,
+      "sampling/sampling_logp_difference/max": 3.637866497039795,
+      "sampling/sampling_logp_difference/mean": 0.021903935819864273,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 7.2379848461423535e-06,
+      "clip_ratio/high_mean": 1.8094962115355884e-06,
+      "clip_ratio/low_mean": 3.17277934982485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353728982347093e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15585.0,
+      "completions/mean_length": 6845.2890625,
+      "completions/mean_terminated_length": 6693.88134765625,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8822609707713127,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004974282346665859,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 216465635.0,
+      "reward": 0.5390625,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 8.749838889343664e-05,
+      "sampling/sampling_logp_difference/max": 9.343890190124512,
+      "sampling/sampling_logp_difference/mean": 0.019389234483242035,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.58592818024772e-05,
+      "clip_ratio/high_mean": 3.9648204506193e-06,
+      "clip_ratio/low_mean": 4.096964960353944e-05,
+      "clip_ratio/low_min": 1.7403560605089297e-05,
+      "clip_ratio/region_mean": 4.49344687467601e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 7805.484375,
+      "completions/mean_terminated_length": 7528.7578125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.9977599084377289,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0033159854356199503,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 217485089.0,
+      "reward": 0.421875,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999412298202515,
+      "sampling/importance_sampling_ratio/min": 7.967943383846432e-05,
+      "sampling/sampling_logp_difference/max": 9.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.021925684064626694,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.8265397557115648e-05,
+      "clip_ratio/high_mean": 4.566349389278912e-06,
+      "clip_ratio/low_mean": 4.044636898470344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5012717691861326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15681.0,
+      "completions/mean_length": 7737.5546875,
+      "completions/mean_terminated_length": 7530.04052734375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.8667014688253403,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034952745772898197,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "num_tokens": 218496040.0,
+      "reward": 0.453125,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999128580093384,
+      "sampling/importance_sampling_ratio/min": 6.726370338583365e-05,
+      "sampling/sampling_logp_difference/max": 9.606889724731445,
+      "sampling/sampling_logp_difference/mean": 0.019742710515856743,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 8.244294804171659e-06,
+      "clip_ratio/high_mean": 2.0610737010429148e-06,
+      "clip_ratio/low_mean": 3.204250072030845e-05,
+      "clip_ratio/low_min": 3.323495775475749e-06,
+      "clip_ratio/region_mean": 3.410357436450795e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15858.0,
+      "completions/mean_length": 7365.84375,
+      "completions/mean_terminated_length": 6601.59326171875,
+      "completions/min_length": 744.0,
+      "completions/min_terminated_length": 744.0,
+      "entropy": 0.8151945173740387,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038676802068948746,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 219459140.0,
+      "reward": 0.46875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00023387260443996638,
+      "sampling/sampling_logp_difference/max": 8.360733985900879,
+      "sampling/sampling_logp_difference/mean": 0.018882082775235176,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 6.87833608026267e-06,
+      "clip_ratio/high_mean": 2.9462287329806713e-06,
+      "clip_ratio/low_mean": 5.435333650893881e-05,
+      "clip_ratio/low_min": 5.33937054569833e-06,
+      "clip_ratio/region_mean": 5.729956546929316e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 6448.0078125,
+      "completions/mean_terminated_length": 6369.771484375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9546648040413857,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004310046322643757,
+      "learning_rate": 1e-05,
+      "loss": 0.1082,
+      "num_tokens": 220304605.0,
+      "reward": 0.5703125,
+      "reward_std": 0.35611939430236816,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999396800994873,
+      "sampling/importance_sampling_ratio/min": 0.0001234127557836473,
+      "sampling/sampling_logp_difference/max": 8.99997615814209,
+      "sampling/sampling_logp_difference/mean": 0.020253397524356842,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 6.196094091137638e-06,
+      "clip_ratio/high_mean": 1.5490235227844096e-06,
+      "clip_ratio/low_mean": 2.5416685957679874e-05,
+      "clip_ratio/low_min": 5.5736391004757024e-06,
+      "clip_ratio/region_mean": 2.696570959415112e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 7457.6484375,
+      "completions/mean_terminated_length": 6941.24755859375,
+      "completions/min_length": 604.0,
+      "completions/min_terminated_length": 604.0,
+      "entropy": 0.8182889074087143,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026646999176591635,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 221281968.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2012200653553009,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173283576965,
+      "sampling/importance_sampling_ratio/min": 2.902353571698768e-06,
+      "sampling/sampling_logp_difference/max": 12.749988555908203,
+      "sampling/sampling_logp_difference/mean": 0.019208962097764015,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 1.6189535017474554e-05,
+      "clip_ratio/high_mean": 4.047383754368639e-06,
+      "clip_ratio/low_mean": 3.127787306311802e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.532525670379982e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 8561.109375,
+      "completions/mean_terminated_length": 7969.79052734375,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.9581378549337387,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016026750672608614,
+      "learning_rate": 1e-05,
+      "loss": 0.0131,
+      "num_tokens": 222399046.0,
+      "reward": 0.34375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 1.653693971093162e-06,
+      "sampling/sampling_logp_difference/max": 13.312499046325684,
+      "sampling/sampling_logp_difference/mean": 0.02173236384987831,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 1.4200771602190798e-05,
+      "clip_ratio/high_mean": 4.3255887476334465e-06,
+      "clip_ratio/low_mean": 5.2955770115659107e-05,
+      "clip_ratio/low_min": 3.402656830076012e-06,
+      "clip_ratio/region_mean": 5.7281358749605715e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16239.0,
+      "completions/mean_length": 7152.34375,
+      "completions/mean_terminated_length": 7079.6533203125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9052041247487068,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005460259038954973,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 223335010.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3356297016143799,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999966621398926,
+      "sampling/importance_sampling_ratio/min": 0.010161337442696095,
+      "sampling/sampling_logp_difference/max": 4.589165210723877,
+      "sampling/sampling_logp_difference/mean": 0.01986619457602501,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 1.4350314813782461e-05,
+      "clip_ratio/high_mean": 3.5875787034456152e-06,
+      "clip_ratio/low_mean": 3.81288905373367e-05,
+      "clip_ratio/low_min": 8.099272235995159e-06,
+      "clip_ratio/region_mean": 4.1716469809216505e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 6678.65625,
+      "completions/mean_terminated_length": 6524.603515625,
+      "completions/min_length": 963.0,
+      "completions/min_terminated_length": 963.0,
+      "entropy": 0.9043187350034714,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005933742038905621,
+      "learning_rate": 1e-05,
+      "loss": 0.0966,
+      "num_tokens": 224207006.0,
+      "reward": 0.484375,
+      "reward_std": 0.3316681981086731,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000031590461731,
+      "sampling/importance_sampling_ratio/min": 0.0011734943836927414,
+      "sampling/sampling_logp_difference/max": 6.747769355773926,
+      "sampling/sampling_logp_difference/mean": 0.019827336072921753,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 1.6498819377375185e-05,
+      "clip_ratio/high_mean": 4.124704844343796e-06,
+      "clip_ratio/low_mean": 3.601791678420341e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.014262168539062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6999.0390625,
+      "completions/mean_terminated_length": 6850.07177734375,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8109970837831497,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003635740838944912,
+      "learning_rate": 1e-05,
+      "loss": 0.104,
+      "num_tokens": 225122891.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999303817749023,
+      "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05,
+      "sampling/sampling_logp_difference/max": 10.987512588500977,
+      "sampling/sampling_logp_difference/mean": 0.018912551924586296,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 9.527577958579059e-06,
+      "clip_ratio/high_mean": 2.3818944896447647e-06,
+      "clip_ratio/low_mean": 3.766565987461945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.004755419373396e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15713.0,
+      "completions/mean_length": 7483.7109375,
+      "completions/mean_terminated_length": 7045.9912109375,
+      "completions/min_length": 1153.0,
+      "completions/min_terminated_length": 1153.0,
+      "entropy": 0.9473970532417297,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003405241761356592,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 226102462.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00002920627594,
+      "sampling/importance_sampling_ratio/min": 0.00525119062513113,
+      "sampling/sampling_logp_difference/max": 5.249300479888916,
+      "sampling/sampling_logp_difference/mean": 0.021076779812574387,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.5867321963014547e-05,
+      "clip_ratio/high_mean": 3.966830490753637e-06,
+      "clip_ratio/low_mean": 3.8259706570897833e-05,
+      "clip_ratio/low_min": 3.549019083948224e-06,
+      "clip_ratio/region_mean": 4.2226537743772496e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 7569.03125,
+      "completions/mean_terminated_length": 7357.47216796875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9231455475091934,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025927501264959574,
+      "learning_rate": 1e-05,
+      "loss": 0.0801,
+      "num_tokens": 227093562.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19097033143043518,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0052477638237178326,
+      "sampling/sampling_logp_difference/max": 5.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.020578444004058838,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.344091060673236e-05,
+      "clip_ratio/high_mean": 3.36022765168309e-06,
+      "clip_ratio/low_mean": 4.253613235505327e-05,
+      "clip_ratio/low_min": 3.5579084851633525e-06,
+      "clip_ratio/region_mean": 4.5896360120423196e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 7589.2734375,
+      "completions/mean_terminated_length": 7378.2001953125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9265239909291267,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030512227676808834,
+      "learning_rate": 1e-05,
+      "loss": 0.04,
+      "num_tokens": 228086405.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0002165911573683843,
+      "sampling/sampling_logp_difference/max": 8.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.020208362489938736,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.9613525410022703e-05,
+      "clip_ratio/high_mean": 4.903381352505676e-06,
+      "clip_ratio/low_mean": 3.184792547017423e-05,
+      "clip_ratio/low_min": 7.29296516510658e-06,
+      "clip_ratio/region_mean": 3.675130722058384e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 8420.6875,
+      "completions/mean_terminated_length": 8096.97509765625,
+      "completions/min_length": 1114.0,
+      "completions/min_terminated_length": 1114.0,
+      "entropy": 0.9572964608669281,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022430522367358208,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 229183765.0,
+      "reward": 0.34375,
+      "reward_std": 0.309583842754364,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 0.00029693738906644285,
+      "sampling/sampling_logp_difference/max": 8.121989250183105,
+      "sampling/sampling_logp_difference/mean": 0.021570362150669098,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.728750577167375e-06,
+      "clip_ratio/high_mean": 1.6821876442918438e-06,
+      "clip_ratio/low_mean": 2.1682553096979973e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.336474062758498e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15736.0,
+      "completions/mean_length": 6809.765625,
+      "completions/mean_terminated_length": 6579.984375,
+      "completions/min_length": 860.0,
+      "completions/min_terminated_length": 860.0,
+      "entropy": 0.884086549282074,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004295065999031067,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 230077607.0,
+      "reward": 0.484375,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00754612497985363,
+      "sampling/sampling_logp_difference/max": 4.886721134185791,
+      "sampling/sampling_logp_difference/mean": 0.019895706325769424,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 2.8609347509700456e-05,
+      "clip_ratio/high_mean": 7.152336877425114e-06,
+      "clip_ratio/low_mean": 5.158006410965754e-05,
+      "clip_ratio/low_min": 5.210069957684027e-06,
+      "clip_ratio/region_mean": 5.873240070286556e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15080.0,
+      "completions/mean_length": 7340.6953125,
+      "completions/mean_terminated_length": 6973.0810546875,
+      "completions/min_length": 1616.0,
+      "completions/min_terminated_length": 1616.0,
+      "entropy": 0.9920620769262314,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004631794057786465,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 231035616.0,
+      "reward": 0.4375,
+      "reward_std": 0.3235401213169098,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337792396545,
+      "sampling/importance_sampling_ratio/min": 0.0002508950710762292,
+      "sampling/sampling_logp_difference/max": 8.290475845336914,
+      "sampling/sampling_logp_difference/mean": 0.020591016858816147,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.3085940774290066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3085940774290066e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14120.0,
+      "completions/mean_length": 6748.875,
+      "completions/mean_terminated_length": 6595.93701171875,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.9867061004042625,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035752104595303535,
+      "learning_rate": 1e-05,
+      "loss": 0.0455,
+      "num_tokens": 231920056.0,
+      "reward": 0.40625,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999653100967407,
+      "sampling/importance_sampling_ratio/min": 0.0003869794018100947,
+      "sampling/sampling_logp_difference/max": 7.8571391105651855,
+      "sampling/sampling_logp_difference/mean": 0.02061416581273079,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 1.2506750408647349e-05,
+      "clip_ratio/high_mean": 3.1266876021618373e-06,
+      "clip_ratio/low_mean": 3.10397430212106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.416643085074611e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 7260.3046875,
+      "completions/mean_terminated_length": 7188.46435546875,
+      "completions/min_length": 1384.0,
+      "completions/min_terminated_length": 1384.0,
+      "entropy": 1.0388494208455086,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036644963547587395,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 232869159.0,
+      "reward": 0.390625,
+      "reward_std": 0.2359209954738617,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999546408653259,
+      "sampling/importance_sampling_ratio/min": 0.0008660226594656706,
+      "sampling/sampling_logp_difference/max": 7.051599502563477,
+      "sampling/sampling_logp_difference/mean": 0.02120530977845192,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.704355301830219e-05,
+      "clip_ratio/high_mean": 6.760888254575548e-06,
+      "clip_ratio/low_mean": 3.1861192269388994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.862208097871189e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16073.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 6354.4609375,
+      "completions/mean_terminated_length": 6354.4609375,
+      "completions/min_length": 1035.0,
+      "completions/min_terminated_length": 1035.0,
+      "entropy": 0.8405331820249557,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004709267523139715,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 233702842.0,
+      "reward": 0.546875,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 0.0046309432946145535,
+      "sampling/sampling_logp_difference/max": 5.37499475479126,
+      "sampling/sampling_logp_difference/mean": 0.019126038998365402,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 9.749228638611385e-06,
+      "clip_ratio/high_mean": 2.437307159652846e-06,
+      "clip_ratio/low_mean": 3.855073941849696e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.098804652130639e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16026.0,
+      "completions/mean_length": 6514.578125,
+      "completions/mean_terminated_length": 6357.9208984375,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "entropy": 1.0254098922014236,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003066045930609107,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 234556348.0,
+      "reward": 0.4375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805092811584,
+      "sampling/importance_sampling_ratio/min": 0.005210204049944878,
+      "sampling/sampling_logp_difference/max": 5.257136344909668,
+      "sampling/sampling_logp_difference/mean": 0.019960148259997368,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.0475813724042382e-05,
+      "clip_ratio/high_mean": 2.6189534310105955e-06,
+      "clip_ratio/low_mean": 3.487835761006863e-05,
+      "clip_ratio/low_min": 2.9392399483185727e-06,
+      "clip_ratio/region_mean": 3.749731081370555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 7379.5546875,
+      "completions/mean_terminated_length": 7236.62744140625,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 1.0397320613265038,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005132520105689764,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 235521091.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519364118576,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999256134033203,
+      "sampling/importance_sampling_ratio/min": 0.00016659013635944575,
+      "sampling/sampling_logp_difference/max": 8.699974060058594,
+      "sampling/sampling_logp_difference/mean": 0.021417103707790375,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.9904123973901733e-05,
+      "clip_ratio/high_mean": 5.776861314643611e-06,
+      "clip_ratio/low_mean": 2.6659268655748747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2436129686175263e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14565.0,
+      "completions/mean_length": 7837.1640625,
+      "completions/mean_terminated_length": 7632.04052734375,
+      "completions/min_length": 1346.0,
+      "completions/min_terminated_length": 1346.0,
+      "entropy": 0.8400963917374611,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028969801496714354,
+      "learning_rate": 1e-05,
+      "loss": 0.0143,
+      "num_tokens": 236544160.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887943267822,
+      "sampling/importance_sampling_ratio/min": 2.883308241052873e-07,
+      "sampling/sampling_logp_difference/max": 15.059157371520996,
+      "sampling/sampling_logp_difference/mean": 0.019267702475190163,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 8.562770290154731e-06,
+      "clip_ratio/high_mean": 2.1406925725386827e-06,
+      "clip_ratio/low_mean": 4.060094340729847e-05,
+      "clip_ratio/low_min": 3.8700886761944275e-06,
+      "clip_ratio/region_mean": 4.2741635979837156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15350.0,
+      "completions/mean_length": 6696.3515625,
+      "completions/mean_terminated_length": 6542.57958984375,
+      "completions/min_length": 1239.0,
+      "completions/min_terminated_length": 1239.0,
+      "entropy": 0.8495818004012108,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003412836929783225,
+      "learning_rate": 1e-05,
+      "loss": 0.0803,
+      "num_tokens": 237423101.0,
+      "reward": 0.515625,
+      "reward_std": 0.37981897592544556,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.012152798473834991,
+      "sampling/sampling_logp_difference/max": 4.410195827484131,
+      "sampling/sampling_logp_difference/mean": 0.018458625301718712,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 1.1463653436294408e-05,
+      "clip_ratio/high_mean": 3.646129641765583e-06,
+      "clip_ratio/low_mean": 6.144847083078275e-05,
+      "clip_ratio/low_min": 1.110105540647055e-05,
+      "clip_ratio/region_mean": 6.509460160941671e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15666.0,
+      "completions/mean_length": 7700.3671875,
+      "completions/mean_terminated_length": 7121.45849609375,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "entropy": 0.8258870914578438,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024443145375698805,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 238429956.0,
+      "reward": 0.375,
+      "reward_std": 0.2872493863105774,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999113082885742,
+      "sampling/importance_sampling_ratio/min": 0.00026112530031241477,
+      "sampling/sampling_logp_difference/max": 8.250510215759277,
+      "sampling/sampling_logp_difference/mean": 0.019427984952926636,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 4.218127742205979e-06,
+      "clip_ratio/high_mean": 1.0545319355514948e-06,
+      "clip_ratio/low_mean": 1.7289162997258245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.834369493280974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16112.0,
+      "completions/mean_length": 6255.21875,
+      "completions/mean_terminated_length": 6094.44482421875,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.8179014846682549,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022747826296836138,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 239250160.0,
+      "reward": 0.5234375,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.0002633975527714938,
+      "sampling/sampling_logp_difference/max": 8.241846084594727,
+      "sampling/sampling_logp_difference/mean": 0.018723051995038986,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 1.698448841125355e-05,
+      "clip_ratio/high_mean": 5.369374321162468e-06,
+      "clip_ratio/low_mean": 6.14647315160255e-05,
+      "clip_ratio/low_min": 5.043576493335422e-06,
+      "clip_ratio/region_mean": 6.683410583718796e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15321.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6914.9609375,
+      "completions/mean_terminated_length": 6914.9609375,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9700981751084328,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005685295443981886,
+      "learning_rate": 1e-05,
+      "loss": -0.0056,
+      "num_tokens": 240156211.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998887777328491,
+      "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05,
+      "sampling/sampling_logp_difference/max": 9.997581481933594,
+      "sampling/sampling_logp_difference/mean": 0.021195171400904655,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9186837764427764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9186837764427764e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15469.0,
+      "completions/mean_length": 5227.53125,
+      "completions/mean_terminated_length": 5139.68505859375,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.9116031974554062,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003880272386595607,
+      "learning_rate": 1e-05,
+      "loss": 0.1246,
+      "num_tokens": 240845295.0,
+      "reward": 0.6328125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000362396240234,
+      "sampling/importance_sampling_ratio/min": 0.00012422871077433228,
+      "sampling/sampling_logp_difference/max": 8.993386268615723,
+      "sampling/sampling_logp_difference/mean": 0.018801718950271606,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 2.5015486926349695e-05,
+      "clip_ratio/high_mean": 8.084949570275057e-06,
+      "clip_ratio/low_mean": 5.524710468307603e-05,
+      "clip_ratio/low_min": 3.776891389861703e-06,
+      "clip_ratio/region_mean": 6.333205465125502e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 8065.4765625,
+      "completions/mean_terminated_length": 7510.90869140625,
+      "completions/min_length": 1055.0,
+      "completions/min_terminated_length": 1055.0,
+      "entropy": 0.7446574792265892,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028986844699829817,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 241895676.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3474721610546112,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999842643737793,
+      "sampling/importance_sampling_ratio/min": 0.0017039099475368857,
+      "sampling/sampling_logp_difference/max": 6.3748297691345215,
+      "sampling/sampling_logp_difference/mean": 0.01853121444582939,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 9.486341014053323e-06,
+      "clip_ratio/high_mean": 2.371585253513331e-06,
+      "clip_ratio/low_mean": 2.896106741445692e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.133265261112683e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15534.0,
+      "completions/max_terminated_length": 15534.0,
+      "completions/mean_length": 6127.359375,
+      "completions/mean_terminated_length": 6127.359375,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.8569132760167122,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003845847910270095,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 242698258.0,
+      "reward": 0.53125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000942945480347,
+      "sampling/importance_sampling_ratio/min": 0.00043231461313553154,
+      "sampling/sampling_logp_difference/max": 7.746356964111328,
+      "sampling/sampling_logp_difference/mean": 0.01856958493590355,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 2.9848330086679198e-05,
+      "clip_ratio/high_mean": 7.4620825216697995e-06,
+      "clip_ratio/low_mean": 4.3558867673709756e-05,
+      "clip_ratio/low_min": 4.417741820361698e-06,
+      "clip_ratio/region_mean": 5.1020949285884853e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15192.0,
+      "completions/mean_length": 6600.1484375,
+      "completions/mean_terminated_length": 6365.33642578125,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.78924310952425,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003953634761273861,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 243560957.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.0006525487406179309,
+      "sampling/sampling_logp_difference/max": 7.334624767303467,
+      "sampling/sampling_logp_difference/mean": 0.018097909167408943,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 6.635561703660642e-06,
+      "clip_ratio/high_mean": 1.6588904259151604e-06,
+      "clip_ratio/low_mean": 2.737523408313791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9034124281679397e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7852.171875,
+      "completions/mean_terminated_length": 7852.171875,
+      "completions/min_length": 1276.0,
+      "completions/min_terminated_length": 1276.0,
+      "entropy": 1.0598893761634827,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00360781978815794,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 244585923.0,
+      "reward": 0.3125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05,
+      "sampling/sampling_logp_difference/max": 10.076086044311523,
+      "sampling/sampling_logp_difference/mean": 0.022330068051815033,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 3.1540168947685743e-06,
+      "clip_ratio/high_mean": 7.885042236921436e-07,
+      "clip_ratio/low_mean": 4.7973388973332476e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.876189268543385e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7972.2265625,
+      "completions/mean_terminated_length": 7700.87890625,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.933217465877533,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0027661293279379606,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 245628064.0,
+      "reward": 0.28125,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05,
+      "sampling/sampling_logp_difference/max": 10.366576194763184,
+      "sampling/sampling_logp_difference/mean": 0.021125148981809616,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.2965969062861404e-05,
+      "clip_ratio/high_mean": 3.241492265715351e-06,
+      "clip_ratio/low_mean": 4.6317693090713874e-05,
+      "clip_ratio/low_min": 3.820877282123547e-06,
+      "clip_ratio/region_mean": 4.955918507221213e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15744.0,
+      "completions/mean_length": 7135.6953125,
+      "completions/mean_terminated_length": 6913.736328125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 0.7786942347884178,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005680318456143141,
+      "learning_rate": 1e-05,
+      "loss": 0.0786,
+      "num_tokens": 246561329.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999462366104126,
+      "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05,
+      "sampling/sampling_logp_difference/max": 9.737424850463867,
+      "sampling/sampling_logp_difference/mean": 0.018504241481423378,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.22437145175536e-05,
+      "clip_ratio/low_min": 1.4025082009538892e-05,
+      "clip_ratio/region_mean": 4.22437145175536e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 6704.046875,
+      "completions/mean_terminated_length": 6627.82666015625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "entropy": 1.0435140281915665,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026402862276881933,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 247437415.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31276631355285645,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998904466629028,
+      "sampling/importance_sampling_ratio/min": 0.0007800163584761322,
+      "sampling/sampling_logp_difference/max": 7.156195640563965,
+      "sampling/sampling_logp_difference/mean": 0.02134273201227188,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 2.223430897174694e-05,
+      "clip_ratio/high_mean": 6.8746438159905665e-06,
+      "clip_ratio/low_mean": 4.7084630978133646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3959275192028144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 5892.5078125,
+      "completions/mean_terminated_length": 5725.9765625,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "entropy": 0.8004944771528244,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003993614576756954,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 248211112.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0024652592837810516,
+      "sampling/sampling_logp_difference/max": 6.005458354949951,
+      "sampling/sampling_logp_difference/mean": 0.01924925297498703,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 2.1833082200828358e-05,
+      "clip_ratio/high_mean": 5.458270550207089e-06,
+      "clip_ratio/low_mean": 3.415995615796419e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961822596920683e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 7812.140625,
+      "completions/mean_terminated_length": 7316.24755859375,
+      "completions/min_length": 1515.0,
+      "completions/min_terminated_length": 1515.0,
+      "entropy": 0.8841542899608612,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001573400106281042,
+      "learning_rate": 1e-05,
+      "loss": 0.0823,
+      "num_tokens": 249228106.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 0.001001527882181108,
+      "sampling/sampling_logp_difference/max": 6.906228542327881,
+      "sampling/sampling_logp_difference/mean": 0.01956877112388611,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 1.014439021673752e-05,
+      "clip_ratio/high_mean": 2.53609755418438e-06,
+      "clip_ratio/low_mean": 3.068193461785995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.321803217204433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 6372.953125,
+      "completions/mean_terminated_length": 6132.6884765625,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.8228401988744736,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021125099156051874,
+      "learning_rate": 1e-05,
+      "loss": 0.0438,
+      "num_tokens": 250063284.0,
+      "reward": 0.5,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05,
+      "sampling/sampling_logp_difference/max": 9.937475204467773,
+      "sampling/sampling_logp_difference/mean": 0.01943521574139595,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 7.023906164249638e-06,
+      "clip_ratio/high_mean": 1.7559765410624095e-06,
+      "clip_ratio/low_mean": 2.526416994896863e-05,
+      "clip_ratio/low_min": 6.7760895490209805e-06,
+      "clip_ratio/region_mean": 2.7020146660561295e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16270.0,
+      "completions/mean_length": 7817.8671875,
+      "completions/mean_terminated_length": 7396.58154296875,
+      "completions/min_length": 1568.0,
+      "completions/min_terminated_length": 1568.0,
+      "entropy": 0.9454319775104523,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022315154783427715,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 251085123.0,
+      "reward": 0.40625,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06,
+      "sampling/sampling_logp_difference/max": 12.760490417480469,
+      "sampling/sampling_logp_difference/mean": 0.021764669567346573,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 1.4797966287005693e-05,
+      "clip_ratio/high_mean": 3.699491571751423e-06,
+      "clip_ratio/low_mean": 4.36271948274225e-05,
+      "clip_ratio/low_min": 3.6957101201551268e-06,
+      "clip_ratio/region_mean": 4.732668639917392e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 7168.4921875,
+      "completions/mean_terminated_length": 6635.36328125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8433891162276268,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 252020906.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589920043945,
+      "sampling/importance_sampling_ratio/min": 0.0003851866349577904,
+      "sampling/sampling_logp_difference/max": 7.861782550811768,
+      "sampling/sampling_logp_difference/mean": 0.01929781585931778,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 1.996871560550062e-05,
+      "clip_ratio/high_mean": 6.089093403716106e-06,
+      "clip_ratio/low_mean": 4.2792244585143635e-05,
+      "clip_ratio/low_min": 1.0337215371691855e-05,
+      "clip_ratio/region_mean": 4.8881338216233416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7322.5078125,
+      "completions/mean_terminated_length": 6876.8603515625,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 0.9157031401991844,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036942458245903254,
+      "learning_rate": 1e-05,
+      "loss": 0.079,
+      "num_tokens": 252977435.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24275577068328857,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999804496765137,
+      "sampling/importance_sampling_ratio/min": 0.00029605376766994596,
+      "sampling/sampling_logp_difference/max": 8.124969482421875,
+      "sampling/sampling_logp_difference/mean": 0.0205365102738142,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.631919460327481e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.631919460327481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16078.0,
+      "completions/mean_length": 7025.484375,
+      "completions/mean_terminated_length": 6723.5966796875,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 1.1329731941223145,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034127074759453535,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 253896161.0,
+      "reward": 0.25,
+      "reward_std": 0.27722424268722534,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0005197672289796174,
+      "sampling/sampling_logp_difference/max": 7.562129497528076,
+      "sampling/sampling_logp_difference/mean": 0.023741140961647034,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 4.368643658381188e-06,
+      "clip_ratio/high_mean": 1.092160914595297e-06,
+      "clip_ratio/low_mean": 2.4661783299961826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5753944555617636e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13776.0,
+      "completions/mean_length": 5996.1796875,
+      "completions/mean_terminated_length": 5661.08837890625,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8773328885436058,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003959407564252615,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 254690264.0,
+      "reward": 0.53125,
+      "reward_std": 0.26645541191101074,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07,
+      "sampling/sampling_logp_difference/max": 15.73043155670166,
+      "sampling/sampling_logp_difference/mean": 0.018407585099339485,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.616483677935321e-05,
+      "clip_ratio/high_mean": 4.041209194838302e-06,
+      "clip_ratio/low_mean": 3.736187466074625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140308453770558e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16383.0,
+      "completions/mean_length": 7165.328125,
+      "completions/mean_terminated_length": 6867.951171875,
+      "completions/min_length": 1115.0,
+      "completions/min_terminated_length": 1115.0,
+      "entropy": 0.9502597972750664,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030910037457942963,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 255626394.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000731945037842,
+      "sampling/importance_sampling_ratio/min": 0.00022311302018351853,
+      "sampling/sampling_logp_difference/max": 8.407832145690918,
+      "sampling/sampling_logp_difference/mean": 0.020668907091021538,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.1702686606440693e-05,
+      "clip_ratio/high_mean": 2.9256716516101733e-06,
+      "clip_ratio/low_mean": 5.5247357522603124e-05,
+      "clip_ratio/low_min": 3.6811261452385224e-06,
+      "clip_ratio/region_mean": 5.8173028264718596e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15375.0,
+      "completions/mean_length": 8001.9296875,
+      "completions/mean_terminated_length": 7661.34912109375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "entropy": 0.8591345250606537,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037233952898532152,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 256673457.0,
+      "reward": 0.421875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999151229858398,
+      "sampling/importance_sampling_ratio/min": 0.0021876997780054808,
+      "sampling/sampling_logp_difference/max": 6.124904632568359,
+      "sampling/sampling_logp_difference/mean": 0.020540472120046616,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 3.721341136042611e-05,
+      "clip_ratio/high_mean": 1.2759249216287571e-05,
+      "clip_ratio/low_mean": 3.570647322703735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.846572301175911e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 6924.84375,
+      "completions/mean_terminated_length": 6697.82421875,
+      "completions/min_length": 803.0,
+      "completions/min_terminated_length": 803.0,
+      "entropy": 0.7969356626272202,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006054217461496592,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 257578501.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.007889713160693645,
+      "sampling/sampling_logp_difference/max": 4.842195510864258,
+      "sampling/sampling_logp_difference/mean": 0.019306108355522156,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.0211543894911301e-05,
+      "clip_ratio/high_mean": 2.5528859737278253e-06,
+      "clip_ratio/low_mean": 5.2388056587915344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4940942732173426e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14439.0,
+      "completions/mean_length": 6203.03125,
+      "completions/mean_terminated_length": 5958.6884765625,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "entropy": 0.8734413683414459,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004903806839138269,
+      "learning_rate": 1e-05,
+      "loss": 0.0689,
+      "num_tokens": 258392625.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999826550483704,
+      "sampling/importance_sampling_ratio/min": 0.00020370795391499996,
+      "sampling/sampling_logp_difference/max": 8.498823165893555,
+      "sampling/sampling_logp_difference/mean": 0.01909301057457924,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.5135058674786706e-05,
+      "clip_ratio/high_mean": 4.64845766146027e-06,
+      "clip_ratio/low_mean": 4.373456977191381e-05,
+      "clip_ratio/low_min": 3.670856358439778e-06,
+      "clip_ratio/region_mean": 4.8383026296505705e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 7982.5390625,
+      "completions/mean_terminated_length": 7641.01611328125,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0091779381036758,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033637424930930138,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 259435270.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999765753746033,
+      "sampling/importance_sampling_ratio/min": 0.0016514655435457826,
+      "sampling/sampling_logp_difference/max": 6.406092166900635,
+      "sampling/sampling_logp_difference/mean": 0.02182736061513424,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 2.3964702677403693e-05,
+      "clip_ratio/high_mean": 5.991175669350923e-06,
+      "clip_ratio/low_mean": 5.2442986770984135e-05,
+      "clip_ratio/low_min": 8.75736759553547e-06,
+      "clip_ratio/region_mean": 5.843416238349164e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6915.3125,
+      "completions/mean_terminated_length": 6688.064453125,
+      "completions/min_length": 778.0,
+      "completions/min_terminated_length": 778.0,
+      "entropy": 0.7964543774724007,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052203768864274025,
+      "learning_rate": 1e-05,
+      "loss": 0.144,
+      "num_tokens": 260337614.0,
+      "reward": 0.46875,
+      "reward_std": 0.37928223609924316,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 7.032832218101248e-05,
+      "sampling/sampling_logp_difference/max": 9.562335968017578,
+      "sampling/sampling_logp_difference/mean": 0.017896221950650215,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 4.458271632756805e-05,
+      "clip_ratio/high_mean": 1.1145679081892013e-05,
+      "clip_ratio/low_mean": 6.243192206056847e-05,
+      "clip_ratio/low_min": 1.2397775662975619e-05,
+      "clip_ratio/region_mean": 7.357759886872373e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7029.4375,
+      "completions/mean_terminated_length": 6880.95263671875,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "entropy": 0.8605096861720085,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005570738110691309,
+      "learning_rate": 1e-05,
+      "loss": 0.0984,
+      "num_tokens": 261254070.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3327290117740631,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999494552612305,
+      "sampling/importance_sampling_ratio/min": 0.0009070249507203698,
+      "sampling/sampling_logp_difference/max": 7.005340576171875,
+      "sampling/sampling_logp_difference/mean": 0.01905740052461624,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 3.390461233720998e-05,
+      "clip_ratio/high_mean": 1.1191766247975465e-05,
+      "clip_ratio/low_mean": 7.46641262594494e-05,
+      "clip_ratio/low_min": 5.041745680500753e-06,
+      "clip_ratio/region_mean": 8.585589102949598e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5858.84375,
+      "completions/mean_terminated_length": 5606.240234375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.8430554121732712,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004496110137552023,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 262024906.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294877052307,
+      "sampling/importance_sampling_ratio/min": 0.00040469475788995624,
+      "sampling/sampling_logp_difference/max": 7.812377452850342,
+      "sampling/sampling_logp_difference/mean": 0.019225869327783585,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 3.2563955301156966e-06,
+      "clip_ratio/high_mean": 8.140988825289242e-07,
+      "clip_ratio/low_mean": 3.7080020149460324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.789411886145899e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15976.0,
+      "completions/mean_length": 8337.328125,
+      "completions/mean_terminated_length": 7728.7568359375,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.901745393872261,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00348713924176991,
+      "learning_rate": 1e-05,
+      "loss": -0.0002,
+      "num_tokens": 263110844.0,
+      "reward": 0.296875,
+      "reward_std": 0.20805485546588898,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998900890350342,
+      "sampling/importance_sampling_ratio/min": 0.0022652465850114822,
+      "sampling/sampling_logp_difference/max": 6.090071678161621,
+      "sampling/sampling_logp_difference/mean": 0.02157524600625038,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 2.3739744847262045e-05,
+      "clip_ratio/high_mean": 5.934936211815511e-06,
+      "clip_ratio/low_mean": 2.823553325015382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.417046866616147e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 7084.7265625,
+      "completions/mean_terminated_length": 6381.42041015625,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8265534415841103,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003980033565312624,
+      "learning_rate": 1e-05,
+      "loss": 0.0551,
+      "num_tokens": 264036169.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673366546631,
+      "sampling/importance_sampling_ratio/min": 0.00012345099821686745,
+      "sampling/sampling_logp_difference/max": 8.999666213989258,
+      "sampling/sampling_logp_difference/mean": 0.018782664090394974,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 1.1745505617000163e-05,
+      "clip_ratio/high_mean": 3.771558226617344e-06,
+      "clip_ratio/low_mean": 6.913120819262986e-05,
+      "clip_ratio/low_min": 2.494283216947224e-05,
+      "clip_ratio/region_mean": 7.290276607818669e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6543.796875,
+      "completions/mean_terminated_length": 6543.796875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8899869695305824,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.006467343773692846,
+      "learning_rate": 1e-05,
+      "loss": 0.1139,
+      "num_tokens": 264892767.0,
+      "reward": 0.484375,
+      "reward_std": 0.3934885561466217,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000489950180054,
+      "sampling/importance_sampling_ratio/min": 9.891482477542013e-05,
+      "sampling/sampling_logp_difference/max": 9.221251487731934,
+      "sampling/sampling_logp_difference/mean": 0.02032080665230751,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.395576979732141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.395576979732141e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16307.0,
+      "completions/mean_length": 8483.390625,
+      "completions/mean_terminated_length": 7813.84765625,
+      "completions/min_length": 1342.0,
+      "completions/min_terminated_length": 1342.0,
+      "entropy": 0.9621479511260986,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003174177836626768,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 265995697.0,
+      "reward": 0.3359375,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.0005628522485494614,
+      "sampling/sampling_logp_difference/max": 7.4824934005737305,
+      "sampling/sampling_logp_difference/mean": 0.02145479805767536,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.2596524811669951e-05,
+      "clip_ratio/high_mean": 3.149131202917488e-06,
+      "clip_ratio/low_mean": 3.7911659774181317e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.106079018129094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14985.0,
+      "completions/mean_length": 7184.578125,
+      "completions/mean_terminated_length": 6963.79248046875,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.9993807673454285,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003356153378263116,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 266937707.0,
+      "reward": 0.3828125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000238418579102,
+      "sampling/importance_sampling_ratio/min": 0.0017036627978086472,
+      "sampling/sampling_logp_difference/max": 6.374974727630615,
+      "sampling/sampling_logp_difference/mean": 0.02204768732190132,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 1.9245163684900035e-05,
+      "clip_ratio/high_mean": 4.811290921225009e-06,
+      "clip_ratio/low_mean": 4.8845648166206956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.365693925796222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16216.0,
+      "completions/mean_length": 7029.2265625,
+      "completions/mean_terminated_length": 6727.45947265625,
+      "completions/min_length": 851.0,
+      "completions/min_terminated_length": 851.0,
+      "entropy": 0.9139953926205635,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006375293247401714,
+      "learning_rate": 1e-05,
+      "loss": 0.0519,
+      "num_tokens": 267853880.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.010649868287146091,
+      "sampling/sampling_logp_difference/max": 4.542207717895508,
+      "sampling/sampling_logp_difference/mean": 0.020365029573440552,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 4.812504812434781e-06,
+      "clip_ratio/high_mean": 1.2031262031086953e-06,
+      "clip_ratio/low_mean": 2.5999243803198624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.720237000630732e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6188.0078125,
+      "completions/mean_terminated_length": 5943.30419921875,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.7640773430466652,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003697809297591448,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 268665721.0,
+      "reward": 0.5078125,
+      "reward_std": 0.20699402689933777,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372363090515,
+      "sampling/importance_sampling_ratio/min": 0.02927250787615776,
+      "sampling/sampling_logp_difference/max": 3.531106472015381,
+      "sampling/sampling_logp_difference/mean": 0.016581017524003983,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1358927824621787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1358927824621787e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 8128.21875,
+      "completions/mean_terminated_length": 7861.90283203125,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.8218234181404114,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002286596456542611,
+      "learning_rate": 1e-05,
+      "loss": 0.0763,
+      "num_tokens": 269726181.0,
+      "reward": 0.375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999798536300659,
+      "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06,
+      "sampling/sampling_logp_difference/max": 12.90043830871582,
+      "sampling/sampling_logp_difference/mean": 0.019403984770178795,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 1.4808477317274082e-05,
+      "clip_ratio/high_mean": 3.7021193293185206e-06,
+      "clip_ratio/low_mean": 3.0363167581981543e-05,
+      "clip_ratio/low_min": 6.364238288369961e-06,
+      "clip_ratio/region_mean": 3.4065286854456645e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 5673.3359375,
+      "completions/mean_terminated_length": 5503.32568359375,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.9275510385632515,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00485506234690547,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 270470616.0,
+      "reward": 0.4921875,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.0009123464697040617,
+      "sampling/sampling_logp_difference/max": 6.999490737915039,
+      "sampling/sampling_logp_difference/mean": 0.01881871558725834,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 1.1274602456978755e-05,
+      "clip_ratio/high_mean": 3.6739949109687586e-06,
+      "clip_ratio/low_mean": 3.968570712231667e-05,
+      "clip_ratio/low_min": 3.4213767321489286e-06,
+      "clip_ratio/region_mean": 4.335970191959859e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 6944.8984375,
+      "completions/mean_terminated_length": 6795.07177734375,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9335741624236107,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005874342750757933,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 271377723.0,
+      "reward": 0.390625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000594854354858,
+      "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05,
+      "sampling/sampling_logp_difference/max": 10.049861907958984,
+      "sampling/sampling_logp_difference/mean": 0.020590776577591896,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 1.264126694877632e-05,
+      "clip_ratio/high_mean": 3.16031673719408e-06,
+      "clip_ratio/low_mean": 3.206376845810155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.522408474054828e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 7705.625,
+      "completions/mean_terminated_length": 7278.8193359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.8491624072194099,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001684082904830575,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 272384891.0,
+      "reward": 0.390625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 6.605865200981498e-05,
+      "sampling/sampling_logp_difference/max": 9.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.020136822015047073,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 9.772357770998497e-06,
+      "clip_ratio/high_mean": 2.443089442749624e-06,
+      "clip_ratio/low_mean": 3.8573590472879005e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.101667946088128e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6611.1484375,
+      "completions/mean_terminated_length": 6534.19677734375,
+      "completions/min_length": 1116.0,
+      "completions/min_terminated_length": 1116.0,
+      "entropy": 0.8867302760481834,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003692191792652011,
+      "learning_rate": 1e-05,
+      "loss": 0.1233,
+      "num_tokens": 273251630.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999606609344482,
+      "sampling/importance_sampling_ratio/min": 0.0031062732450664043,
+      "sampling/sampling_logp_difference/max": 5.774331569671631,
+      "sampling/sampling_logp_difference/mean": 0.019237037748098373,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 3.0103737344688852e-05,
+      "clip_ratio/high_mean": 9.664363972206047e-06,
+      "clip_ratio/low_mean": 1.7575501146893657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.723986426644842e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15786.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 6770.46875,
+      "completions/mean_terminated_length": 6770.46875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.8252957463264465,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004167635925114155,
+      "learning_rate": 1e-05,
+      "loss": -0.0072,
+      "num_tokens": 274146482.0,
+      "reward": 0.5703125,
+      "reward_std": 0.23486016690731049,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.00010247006866848096,
+      "sampling/sampling_logp_difference/max": 9.18593978881836,
+      "sampling/sampling_logp_difference/mean": 0.019684650003910065,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 6.529460733872838e-06,
+      "clip_ratio/high_mean": 1.6323651834682096e-06,
+      "clip_ratio/low_mean": 3.877351048231503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.040587566578324e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15827.0,
+      "completions/mean_length": 8210.859375,
+      "completions/mean_terminated_length": 7365.36181640625,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.8118235394358635,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030363225378096104,
+      "learning_rate": 1e-05,
+      "loss": 0.0531,
+      "num_tokens": 275214040.0,
+      "reward": 0.3515625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998943209648132,
+      "sampling/importance_sampling_ratio/min": 0.002854935359209776,
+      "sampling/sampling_logp_difference/max": 5.858705997467041,
+      "sampling/sampling_logp_difference/mean": 0.019275270402431488,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.0800629146106075e-06,
+      "clip_ratio/high_mean": 1.7700157286526519e-06,
+      "clip_ratio/low_mean": 2.3981688286767167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5751703674359305e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14900.0,
+      "completions/mean_length": 7072.8828125,
+      "completions/mean_terminated_length": 6849.41650390625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8018335327506065,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004777858033776283,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 276138049.0,
+      "reward": 0.453125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368190765381,
+      "sampling/importance_sampling_ratio/min": 0.0028502768836915493,
+      "sampling/sampling_logp_difference/max": 5.860339164733887,
+      "sampling/sampling_logp_difference/mean": 0.01849908009171486,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 2.259368602608447e-05,
+      "clip_ratio/high_mean": 5.648421506521117e-06,
+      "clip_ratio/low_mean": 4.28424866640853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.849090737479855e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14447.0,
+      "completions/mean_length": 5889.8359375,
+      "completions/mean_terminated_length": 5723.26220703125,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.7976400703191757,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030593445990234613,
+      "learning_rate": 1e-05,
+      "loss": 0.1331,
+      "num_tokens": 276910124.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999091029167175,
+      "sampling/importance_sampling_ratio/min": 0.000139843366923742,
+      "sampling/sampling_logp_difference/max": 8.874987602233887,
+      "sampling/sampling_logp_difference/mean": 0.01834402233362198,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 1.4654247024736833e-05,
+      "clip_ratio/high_mean": 3.663561756184208e-06,
+      "clip_ratio/low_mean": 2.377464920755301e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7438210736363544e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 7144.265625,
+      "completions/mean_terminated_length": 6689.85205078125,
+      "completions/min_length": 1200.0,
+      "completions/min_terminated_length": 1200.0,
+      "entropy": 0.8309404999017715,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004245694726705551,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 277843542.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998534321784973,
+      "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05,
+      "sampling/sampling_logp_difference/max": 11.499897956848145,
+      "sampling/sampling_logp_difference/mean": 0.01875344291329384,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 6.252500952541595e-06,
+      "clip_ratio/high_mean": 2.241558604509919e-06,
+      "clip_ratio/low_mean": 4.735765514851664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9599213525652885e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15722.0,
+      "completions/mean_length": 6779.5234375,
+      "completions/mean_terminated_length": 6703.8974609375,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.9584890529513359,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035574575886130333,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 278730129.0,
+      "reward": 0.3984375,
+      "reward_std": 0.32825323939323425,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.005792221520096064,
+      "sampling/sampling_logp_difference/max": 5.151239395141602,
+      "sampling/sampling_logp_difference/mean": 0.02137477695941925,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 3.2948471016425174e-05,
+      "clip_ratio/high_mean": 9.518853403278627e-06,
+      "clip_ratio/low_mean": 2.195712454522436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.14759782895635e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15892.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 5582.9765625,
+      "completions/mean_terminated_length": 5582.9765625,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8629376217722893,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037982752546668053,
+      "learning_rate": 1e-05,
+      "loss": 0.0331,
+      "num_tokens": 279462542.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3164186477661133,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999780058860779,
+      "sampling/importance_sampling_ratio/min": 0.0021874974481761456,
+      "sampling/sampling_logp_difference/max": 6.124997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01906203106045723,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.1029473625967512e-05,
+      "clip_ratio/high_mean": 2.757368406491878e-06,
+      "clip_ratio/low_mean": 5.367386921761863e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6431237737797346e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 6942.2578125,
+      "completions/mean_terminated_length": 6477.90966796875,
+      "completions/min_length": 1156.0,
+      "completions/min_terminated_length": 1156.0,
+      "entropy": 0.8147861957550049,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027678858023136854,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 280370207.0,
+      "reward": 0.4375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998471736907959,
+      "sampling/importance_sampling_ratio/min": 0.00023058800434228033,
+      "sampling/sampling_logp_difference/max": 8.3748779296875,
+      "sampling/sampling_logp_difference/mean": 0.01940828748047352,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 2.6367894406575942e-05,
+      "clip_ratio/high_mean": 8.765707434577052e-06,
+      "clip_ratio/low_mean": 3.232976985145797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.109547796815605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6242.53125,
+      "completions/mean_terminated_length": 5915.38671875,
+      "completions/min_length": 1220.0,
+      "completions/min_terminated_length": 1220.0,
+      "entropy": 0.878915011882782,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00577945914119482,
+      "learning_rate": 1e-05,
+      "loss": 0.0839,
+      "num_tokens": 281189491.0,
+      "reward": 0.515625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 9.611724817659706e-05,
+      "sampling/sampling_logp_difference/max": 9.2499418258667,
+      "sampling/sampling_logp_difference/mean": 0.01948760263621807,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 3.50839609382092e-05,
+      "clip_ratio/high_mean": 1.1664920634757436e-05,
+      "clip_ratio/low_mean": 1.833109013205103e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9996010880495305e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 7004.015625,
+      "completions/mean_terminated_length": 6622.71533203125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "entropy": 0.7964659407734871,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014128695474937558,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 282103997.0,
+      "reward": 0.4140625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.0024504722096025944,
+      "sampling/sampling_logp_difference/max": 6.011474609375,
+      "sampling/sampling_logp_difference/mean": 0.019019678235054016,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.832260545597819e-05,
+      "clip_ratio/high_mean": 4.580651363994548e-06,
+      "clip_ratio/low_mean": 5.309064226821647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.767129368905444e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7822.6953125,
+      "completions/mean_terminated_length": 7546.52392578125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.8571138679981232,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002476039342582226,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 283122382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999314546585083,
+      "sampling/importance_sampling_ratio/min": 0.0009774373611435294,
+      "sampling/sampling_logp_difference/max": 6.930576324462891,
+      "sampling/sampling_logp_difference/mean": 0.020557202398777008,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 5.738419986300869e-06,
+      "clip_ratio/high_mean": 1.4346049965752172e-06,
+      "clip_ratio/low_mean": 4.19679121819172e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3402517292179255e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7738.8984375,
+      "completions/mean_terminated_length": 6844.57763671875,
+      "completions/min_length": 897.0,
+      "completions/min_terminated_length": 897.0,
+      "entropy": 0.7839021533727646,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005309853237122297,
+      "learning_rate": 1e-05,
+      "loss": 0.043,
+      "num_tokens": 284130081.0,
+      "reward": 0.5234375,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998971223831177,
+      "sampling/importance_sampling_ratio/min": 0.0001319014554610476,
+      "sampling/sampling_logp_difference/max": 8.933455467224121,
+      "sampling/sampling_logp_difference/mean": 0.01873316988348961,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 1.007085802484653e-05,
+      "clip_ratio/high_mean": 2.5177145062116324e-06,
+      "clip_ratio/low_mean": 4.043528815600439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.295300277590286e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15952.0,
+      "completions/mean_length": 7102.2421875,
+      "completions/mean_terminated_length": 6954.9130859375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.8530801385641098,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004228116944432259,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 285058720.0,
+      "reward": 0.5078125,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00012956927821505815,
+      "sampling/sampling_logp_difference/max": 8.951294898986816,
+      "sampling/sampling_logp_difference/mean": 0.019325006753206253,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 4.06874551117653e-06,
+      "clip_ratio/high_mean": 1.0171863777941326e-06,
+      "clip_ratio/low_mean": 3.661125703047219e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.762844340826632e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15594.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6583.4765625,
+      "completions/mean_terminated_length": 6583.4765625,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 1.021921381354332,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004967439454048872,
+      "learning_rate": 1e-05,
+      "loss": 0.0374,
+      "num_tokens": 285919765.0,
+      "reward": 0.328125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00004243850708,
+      "sampling/importance_sampling_ratio/min": 0.016675354912877083,
+      "sampling/sampling_logp_difference/max": 4.093823432922363,
+      "sampling/sampling_logp_difference/mean": 0.021393200382590294,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.2215251445013564e-05,
+      "clip_ratio/high_mean": 3.053812861253391e-06,
+      "clip_ratio/low_mean": 4.05305947879242e-05,
+      "clip_ratio/low_min": 4.215567059873138e-06,
+      "clip_ratio/region_mean": 4.358440742180392e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16299.0,
+      "completions/mean_length": 7770.5859375,
+      "completions/mean_terminated_length": 7346.97509765625,
+      "completions/min_length": 1040.0,
+      "completions/min_terminated_length": 1040.0,
+      "entropy": 1.0466903448104858,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004189736675471067,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 286935512.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2369818240404129,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797344207764,
+      "sampling/importance_sampling_ratio/min": 0.011683559976518154,
+      "sampling/sampling_logp_difference/max": 4.449572563171387,
+      "sampling/sampling_logp_difference/mean": 0.021805983036756516,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 2.0567378214764176e-05,
+      "clip_ratio/high_mean": 5.141844553691044e-06,
+      "clip_ratio/low_mean": 1.8177100628236076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3318944840866607e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15758.0,
+      "completions/mean_length": 5689.2421875,
+      "completions/mean_terminated_length": 5432.568359375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.7778806164860725,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0032866497058421373,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 287681943.0,
+      "reward": 0.640625,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940812587738,
+      "sampling/importance_sampling_ratio/min": 0.00038077132194302976,
+      "sampling/sampling_logp_difference/max": 7.873311519622803,
+      "sampling/sampling_logp_difference/mean": 0.01789461076259613,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 3.109086901531555e-05,
+      "clip_ratio/high_mean": 7.772717253828887e-06,
+      "clip_ratio/low_mean": 3.1423560130861006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.919627738468989e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13820.0,
+      "completions/mean_length": 6288.1875,
+      "completions/mean_terminated_length": 6127.93701171875,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.7709921672940254,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023572889622300863,
+      "learning_rate": 1e-05,
+      "loss": 0.0746,
+      "num_tokens": 288506735.0,
+      "reward": 0.484375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474287033081,
+      "sampling/importance_sampling_ratio/min": 0.000430915504693985,
+      "sampling/sampling_logp_difference/max": 7.749598503112793,
+      "sampling/sampling_logp_difference/mean": 0.017407266423106194,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 3.4638953366084024e-05,
+      "clip_ratio/high_mean": 9.51674803673086e-06,
+      "clip_ratio/low_mean": 6.26047980176736e-05,
+      "clip_ratio/low_min": 5.51267930859467e-06,
+      "clip_ratio/region_mean": 7.212154741864651e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 6775.0234375,
+      "completions/mean_terminated_length": 6465.05615234375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9338318258523941,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034220058005303144,
+      "learning_rate": 1e-05,
+      "loss": 0.0986,
+      "num_tokens": 289395498.0,
+      "reward": 0.390625,
+      "reward_std": 0.34533774852752686,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603033065796,
+      "sampling/importance_sampling_ratio/min": 0.0317598432302475,
+      "sampling/sampling_logp_difference/max": 3.449552536010742,
+      "sampling/sampling_logp_difference/mean": 0.019930530339479446,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 7.159989991123439e-05,
+      "clip_ratio/low_min": 1.5592839645250933e-05,
+      "clip_ratio/region_mean": 7.159989991123439e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 7142.9375,
+      "completions/mean_terminated_length": 6844.83837890625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 0.971405878663063,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002513247774913907,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 290329082.0,
+      "reward": 0.328125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999737739562988,
+      "sampling/importance_sampling_ratio/min": 3.152207455059397e-07,
+      "sampling/sampling_logp_difference/max": 14.969992637634277,
+      "sampling/sampling_logp_difference/mean": 0.022366533055901527,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 1.6507752206962323e-05,
+      "clip_ratio/high_mean": 4.126938051740581e-06,
+      "clip_ratio/low_mean": 1.7493430505055585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1620368215735652e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15581.0,
+      "completions/mean_length": 6412.2109375,
+      "completions/mean_terminated_length": 6333.69287109375,
+      "completions/min_length": 544.0,
+      "completions/min_terminated_length": 544.0,
+      "entropy": 0.9136044681072235,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0056767817586660385,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 291170133.0,
+      "reward": 0.421875,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999720454216003,
+      "sampling/importance_sampling_ratio/min": 0.000458698661532253,
+      "sampling/sampling_logp_difference/max": 7.687117099761963,
+      "sampling/sampling_logp_difference/mean": 0.020012658089399338,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 8.26085442895419e-06,
+      "clip_ratio/high_mean": 2.0652136072385474e-06,
+      "clip_ratio/low_mean": 3.6938338666914206e-05,
+      "clip_ratio/low_min": 5.699044777429663e-06,
+      "clip_ratio/region_mean": 3.900355193309224e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16111.0,
+      "completions/mean_length": 8066.1015625,
+      "completions/mean_terminated_length": 7797.7822265625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 1.0789504647254944,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00243841833434999,
+      "learning_rate": 1e-05,
+      "loss": 0.0432,
+      "num_tokens": 292222082.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999664425849915,
+      "sampling/importance_sampling_ratio/min": 8.481895929435268e-05,
+      "sampling/sampling_logp_difference/max": 9.374991416931152,
+      "sampling/sampling_logp_difference/mean": 0.023650091141462326,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 5.320054697222076e-06,
+      "clip_ratio/high_mean": 1.330013674305519e-06,
+      "clip_ratio/low_mean": 1.9117383317279746e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0447396991585265e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15176.0,
+      "completions/mean_length": 6836.046875,
+      "completions/mean_terminated_length": 6606.896484375,
+      "completions/min_length": 785.0,
+      "completions/min_terminated_length": 785.0,
+      "entropy": 1.218759760260582,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0020856577903032303,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 293115984.0,
+      "reward": 0.21875,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999911785125732,
+      "sampling/importance_sampling_ratio/min": 2.784526441246271e-05,
+      "sampling/sampling_logp_difference/max": 10.488847732543945,
+      "sampling/sampling_logp_difference/mean": 0.022012067958712578,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 2.5695502699818462e-05,
+      "clip_ratio/high_mean": 7.549717793153832e-06,
+      "clip_ratio/low_mean": 4.6741323160404136e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.429104089671455e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15796.0,
+      "completions/mean_length": 7501.9921875,
+      "completions/mean_terminated_length": 7140.9345703125,
+      "completions/min_length": 1237.0,
+      "completions/min_terminated_length": 1237.0,
+      "entropy": 0.8940394818782806,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005163854919373989,
+      "learning_rate": 1e-05,
+      "loss": 0.0354,
+      "num_tokens": 294099503.0,
+      "reward": 0.328125,
+      "reward_std": 0.30904707312583923,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999276399612427,
+      "sampling/importance_sampling_ratio/min": 0.0006545600481331348,
+      "sampling/sampling_logp_difference/max": 7.331547260284424,
+      "sampling/sampling_logp_difference/mean": 0.020813245326280594,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 3.1606674838258186e-05,
+      "clip_ratio/high_mean": 9.45794374729303e-06,
+      "clip_ratio/low_mean": 4.5567895540443715e-05,
+      "clip_ratio/low_min": 4.458871444512624e-06,
+      "clip_ratio/region_mean": 5.502583962879726e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7204.828125,
+      "completions/mean_terminated_length": 6908.7255859375,
+      "completions/min_length": 846.0,
+      "completions/min_terminated_length": 846.0,
+      "entropy": 0.9961872175335884,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029277894645929337,
+      "learning_rate": 1e-05,
+      "loss": 0.0963,
+      "num_tokens": 295042105.0,
+      "reward": 0.390625,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000677108764648,
+      "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05,
+      "sampling/sampling_logp_difference/max": 10.872637748718262,
+      "sampling/sampling_logp_difference/mean": 0.020187582820653915,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 1.7963964182854397e-05,
+      "clip_ratio/high_mean": 5.194059781388205e-06,
+      "clip_ratio/low_mean": 1.8380221035840805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.357428081722901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15856.0,
+      "completions/mean_length": 6256.859375,
+      "completions/mean_terminated_length": 6013.80810546875,
+      "completions/min_length": 1006.0,
+      "completions/min_terminated_length": 1006.0,
+      "entropy": 0.9293600022792816,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032952844630926847,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 295867039.0,
+      "reward": 0.46875,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999649524688721,
+      "sampling/importance_sampling_ratio/min": 7.995560008566827e-05,
+      "sampling/sampling_logp_difference/max": 9.434039115905762,
+      "sampling/sampling_logp_difference/mean": 0.019491540268063545,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 7.577551059512189e-06,
+      "clip_ratio/high_mean": 1.8943877648780472e-06,
+      "clip_ratio/low_mean": 2.7479814093567256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9374201631071628e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15412.0,
+      "completions/mean_length": 7397.84375,
+      "completions/mean_terminated_length": 7032.552734375,
+      "completions/min_length": 923.0,
+      "completions/min_terminated_length": 923.0,
+      "entropy": 0.8508890569210052,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029417150653898716,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 296832843.0,
+      "reward": 0.375,
+      "reward_std": 0.2867125868797302,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000183582305908,
+      "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05,
+      "sampling/sampling_logp_difference/max": 10.93724250793457,
+      "sampling/sampling_logp_difference/mean": 0.01975393109023571,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 3.281225508544594e-05,
+      "clip_ratio/high_mean": 1.3302957199812226e-05,
+      "clip_ratio/low_mean": 5.109179869577929e-05,
+      "clip_ratio/low_min": 6.657612175331451e-06,
+      "clip_ratio/region_mean": 6.439475532715733e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14983.0,
+      "completions/mean_length": 6897.765625,
+      "completions/mean_terminated_length": 6823.07080078125,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.9046694040298462,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026788609102368355,
+      "learning_rate": 1e-05,
+      "loss": 0.0664,
+      "num_tokens": 297735285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3266732692718506,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999909520149231,
+      "sampling/importance_sampling_ratio/min": 0.001710799871943891,
+      "sampling/sampling_logp_difference/max": 6.370794296264648,
+      "sampling/sampling_logp_difference/mean": 0.020578179508447647,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 1.7319889593636617e-05,
+      "clip_ratio/high_mean": 5.168538336874917e-06,
+      "clip_ratio/low_mean": 7.019768918326008e-05,
+      "clip_ratio/low_min": 2.541147478041239e-05,
+      "clip_ratio/region_mean": 7.53662266106403e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15525.0,
+      "completions/mean_length": 6971.9921875,
+      "completions/mean_terminated_length": 6509.10595703125,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "entropy": 0.8658201694488525,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005915141198784113,
+      "learning_rate": 1e-05,
+      "loss": 0.0923,
+      "num_tokens": 298645124.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3742823898792267,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999268651008606,
+      "sampling/importance_sampling_ratio/min": 0.000970841443631798,
+      "sampling/sampling_logp_difference/max": 6.937347412109375,
+      "sampling/sampling_logp_difference/mean": 0.01906151883304119,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.8332865238335216e-05,
+      "clip_ratio/high_mean": 4.583216309583804e-06,
+      "clip_ratio/low_mean": 6.167940273371642e-05,
+      "clip_ratio/low_min": 5.969151516183047e-06,
+      "clip_ratio/region_mean": 6.626261847486603e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15054.0,
+      "completions/mean_length": 6545.6953125,
+      "completions/mean_terminated_length": 5889.80859375,
+      "completions/min_length": 800.0,
+      "completions/min_terminated_length": 800.0,
+      "entropy": 0.779609851539135,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0032792428974062204,
+      "learning_rate": 1e-05,
+      "loss": 0.097,
+      "num_tokens": 299503781.0,
+      "reward": 0.609375,
+      "reward_std": 0.38293448090553284,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999361634254456,
+      "sampling/importance_sampling_ratio/min": 0.002187495119869709,
+      "sampling/sampling_logp_difference/max": 6.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.017413027584552765,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.46246323235755e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.46246323235755e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7226.515625,
+      "completions/mean_terminated_length": 7006.736328125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9573849961161613,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005092279519885778,
+      "learning_rate": 1e-05,
+      "loss": 0.1102,
+      "num_tokens": 300447903.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999373555183411,
+      "sampling/importance_sampling_ratio/min": 0.000627054600045085,
+      "sampling/sampling_logp_difference/max": 7.374476909637451,
+      "sampling/sampling_logp_difference/mean": 0.021570835262537003,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 5.487269390869187e-06,
+      "clip_ratio/high_mean": 1.3718173477172968e-06,
+      "clip_ratio/low_mean": 4.7280102080549113e-05,
+      "clip_ratio/low_min": 1.0166083029616857e-05,
+      "clip_ratio/region_mean": 4.865191931457957e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14967.0,
+      "completions/mean_length": 5755.171875,
+      "completions/mean_terminated_length": 5323.10546875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8482184633612633,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005033228080719709,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 301206021.0,
+      "reward": 0.390625,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999947547912598,
+      "sampling/importance_sampling_ratio/min": 0.0014573346124961972,
+      "sampling/sampling_logp_difference/max": 6.531146049499512,
+      "sampling/sampling_logp_difference/mean": 0.018870476633310318,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 5.421346941147931e-06,
+      "clip_ratio/high_mean": 1.3553367352869827e-06,
+      "clip_ratio/low_mean": 1.6510994441887306e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.786633117717429e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 7098.7265625,
+      "completions/mean_terminated_length": 6875.88037109375,
+      "completions/min_length": 947.0,
+      "completions/min_terminated_length": 947.0,
+      "entropy": 0.87320177257061,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.007659573573619127,
+      "learning_rate": 1e-05,
+      "loss": 0.0707,
+      "num_tokens": 302133890.0,
+      "reward": 0.421875,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0012466582702472806,
+      "sampling/sampling_logp_difference/max": 6.687288761138916,
+      "sampling/sampling_logp_difference/mean": 0.019994346424937248,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 1.1556229310372146e-05,
+      "clip_ratio/high_mean": 2.8890573275930365e-06,
+      "clip_ratio/low_mean": 3.8744643916288624e-05,
+      "clip_ratio/low_min": 6.108287834649673e-06,
+      "clip_ratio/region_mean": 4.1633702039689524e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16139.0,
+      "completions/mean_length": 6399.96875,
+      "completions/mean_terminated_length": 6077.90283203125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9481896534562111,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014135175151750445,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 302972566.0,
+      "reward": 0.4140625,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0025698256213217974,
+      "sampling/sampling_logp_difference/max": 5.963917255401611,
+      "sampling/sampling_logp_difference/mean": 0.02073008380830288,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 6.59491388432798e-06,
+      "clip_ratio/high_mean": 2.545892130001448e-06,
+      "clip_ratio/low_mean": 4.620846755187813e-05,
+      "clip_ratio/low_min": 6.243132702365983e-06,
+      "clip_ratio/region_mean": 4.875435956819274e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 7298.078125,
+      "completions/mean_terminated_length": 7226.53564453125,
+      "completions/min_length": 1009.0,
+      "completions/min_terminated_length": 1009.0,
+      "entropy": 0.8719206526875496,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027898226398974657,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 303925976.0,
+      "reward": 0.484375,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.005236432887613773,
+      "sampling/sampling_logp_difference/max": 5.252114772796631,
+      "sampling/sampling_logp_difference/mean": 0.020944103598594666,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 1.052124343914329e-05,
+      "clip_ratio/high_mean": 2.6303108597858227e-06,
+      "clip_ratio/low_mean": 2.010384196182713e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.273415248055244e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14980.0,
+      "completions/mean_length": 5667.0390625,
+      "completions/mean_terminated_length": 5496.9287109375,
+      "completions/min_length": 974.0,
+      "completions/min_terminated_length": 974.0,
+      "entropy": 0.8791451379656792,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012764945859089494,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 304675157.0,
+      "reward": 0.390625,
+      "reward_std": 0.17965976893901825,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000383853912354,
+      "sampling/importance_sampling_ratio/min": 5.054428584116977e-06,
+      "sampling/sampling_logp_difference/max": 12.195245742797852,
+      "sampling/sampling_logp_difference/mean": 0.018928447738289833,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 9.578045592206763e-06,
+      "clip_ratio/high_mean": 2.3945113980516908e-06,
+      "clip_ratio/low_mean": 3.1114799753595435e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350931149270764e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15354.0,
+      "completions/max_terminated_length": 15354.0,
+      "completions/mean_length": 5874.4453125,
+      "completions/mean_terminated_length": 5874.4453125,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9577538818120956,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00509974779561162,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 305447038.0,
+      "reward": 0.515625,
+      "reward_std": 0.24777325987815857,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999423027038574,
+      "sampling/importance_sampling_ratio/min": 0.004791648127138615,
+      "sampling/sampling_logp_difference/max": 5.340880870819092,
+      "sampling/sampling_logp_difference/mean": 0.02114470861852169,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 1.0903062275247066e-05,
+      "clip_ratio/high_mean": 2.7257655688117666e-06,
+      "clip_ratio/low_mean": 4.784364205079328e-05,
+      "clip_ratio/low_min": 3.861600362142781e-06,
+      "clip_ratio/region_mean": 5.056940744907479e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 6197.5703125,
+      "completions/mean_terminated_length": 6035.88134765625,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.8665244281291962,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030849494505673647,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 306258023.0,
+      "reward": 0.515625,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998056888580322,
+      "sampling/importance_sampling_ratio/min": 0.000830297009088099,
+      "sampling/sampling_logp_difference/max": 7.093727111816406,
+      "sampling/sampling_logp_difference/mean": 0.021017421036958694,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 1.4299712574938894e-05,
+      "clip_ratio/high_mean": 4.3520980170796975e-06,
+      "clip_ratio/low_mean": 6.213493452378316e-05,
+      "clip_ratio/low_min": 1.0056635801447555e-05,
+      "clip_ratio/region_mean": 6.648703174505499e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 7522.578125,
+      "completions/mean_terminated_length": 7381.9208984375,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.8185881152749062,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002946985885500908,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 307240305.0,
+      "reward": 0.3125,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.005127199459820986,
+      "sampling/sampling_logp_difference/max": 5.273195743560791,
+      "sampling/sampling_logp_difference/mean": 0.01965932548046112,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.693051035545068e-05,
+      "clip_ratio/high_mean": 5.08456730585749e-06,
+      "clip_ratio/low_mean": 4.2052345861520735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.713691282631771e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14090.0,
+      "completions/mean_length": 6403.2265625,
+      "completions/mean_terminated_length": 6163.6884765625,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "entropy": 0.8359840363264084,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031181599479168653,
+      "learning_rate": 1e-05,
+      "loss": 0.072,
+      "num_tokens": 308079318.0,
+      "reward": 0.5,
+      "reward_std": 0.27145031094551086,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999215602874756,
+      "sampling/importance_sampling_ratio/min": 6.73715621815063e-05,
+      "sampling/sampling_logp_difference/max": 9.605287551879883,
+      "sampling/sampling_logp_difference/mean": 0.01963040418922901,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 1.3988919135954347e-05,
+      "clip_ratio/high_mean": 3.497229783988587e-06,
+      "clip_ratio/low_mean": 6.722658486069122e-05,
+      "clip_ratio/low_min": 1.858519090092159e-05,
+      "clip_ratio/region_mean": 7.072381458783639e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16148.0,
+      "completions/mean_length": 7954.03125,
+      "completions/mean_terminated_length": 7751.71240234375,
+      "completions/min_length": 632.0,
+      "completions/min_terminated_length": 632.0,
+      "entropy": 0.905990719795227,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002656223252415657,
+      "learning_rate": 1e-05,
+      "loss": 0.1022,
+      "num_tokens": 309117770.0,
+      "reward": 0.3828125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999536275863647,
+      "sampling/importance_sampling_ratio/min": 0.0003354826185386628,
+      "sampling/sampling_logp_difference/max": 7.999940395355225,
+      "sampling/sampling_logp_difference/mean": 0.020741507411003113,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.7610595023143105e-05,
+      "clip_ratio/high_mean": 4.402648755785776e-06,
+      "clip_ratio/low_mean": 4.337988764291367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.778253651238629e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6630.09375,
+      "completions/mean_terminated_length": 6315.45166015625,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.870736837387085,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0060529084876179695,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 309988894.0,
+      "reward": 0.515625,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998822212219238,
+      "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05,
+      "sampling/sampling_logp_difference/max": 10.716434478759766,
+      "sampling/sampling_logp_difference/mean": 0.02060208097100258,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 1.0448093235027045e-05,
+      "clip_ratio/high_mean": 2.6120233087567613e-06,
+      "clip_ratio/low_mean": 3.1030769946482906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.364279325523967e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15920.0,
+      "completions/max_terminated_length": 15920.0,
+      "completions/mean_length": 6679.6171875,
+      "completions/mean_terminated_length": 6679.6171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9812518879771233,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00400698184967041,
+      "learning_rate": 1e-05,
+      "loss": 0.0605,
+      "num_tokens": 310864013.0,
+      "reward": 0.421875,
+      "reward_std": 0.3295465111732483,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999049305915833,
+      "sampling/importance_sampling_ratio/min": 0.0020593837834894657,
+      "sampling/sampling_logp_difference/max": 6.1853485107421875,
+      "sampling/sampling_logp_difference/mean": 0.02098071575164795,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 2.124982574969181e-05,
+      "clip_ratio/high_mean": 7.736592579021817e-06,
+      "clip_ratio/low_mean": 2.900951585615985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.674610888992902e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14541.0,
+      "completions/mean_length": 5523.796875,
+      "completions/mean_terminated_length": 5173.4677734375,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9120645374059677,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005929585546255112,
+      "learning_rate": 1e-05,
+      "loss": 0.0362,
+      "num_tokens": 311589987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998446702957153,
+      "sampling/importance_sampling_ratio/min": 0.0010661041596904397,
+      "sampling/sampling_logp_difference/max": 6.843744277954102,
+      "sampling/sampling_logp_difference/mean": 0.019948206841945648,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 2.4486997745043482e-05,
+      "clip_ratio/high_mean": 8.219769085826556e-06,
+      "clip_ratio/low_mean": 5.346400575945154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.168377467474784e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15401.0,
+      "completions/mean_length": 6361.3671875,
+      "completions/mean_terminated_length": 6282.44873046875,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.8044678047299385,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006622390355914831,
+      "learning_rate": 1e-05,
+      "loss": 0.1023,
+      "num_tokens": 312424034.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3724474310874939,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000219345092773,
+      "sampling/importance_sampling_ratio/min": 0.0003157092141918838,
+      "sampling/sampling_logp_difference/max": 8.060688972473145,
+      "sampling/sampling_logp_difference/mean": 0.018907658755779266,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 1.0407376748844399e-05,
+      "clip_ratio/high_mean": 2.6018441872110998e-06,
+      "clip_ratio/low_mean": 5.925514369664597e-05,
+      "clip_ratio/low_min": 1.3324347946763737e-05,
+      "clip_ratio/region_mean": 6.185698703120579e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15883.0,
+      "completions/mean_length": 7109.0,
+      "completions/mean_terminated_length": 7035.96826171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9167275875806808,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004639944992959499,
+      "learning_rate": 1e-05,
+      "loss": 0.0861,
+      "num_tokens": 313353346.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3826971650123596,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999389052391052,
+      "sampling/importance_sampling_ratio/min": 0.0019070414127781987,
+      "sampling/sampling_logp_difference/max": 6.262202262878418,
+      "sampling/sampling_logp_difference/mean": 0.02155841514468193,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 3.959046694035351e-05,
+      "clip_ratio/high_mean": 1.0912523691786191e-05,
+      "clip_ratio/low_mean": 3.3944450819944905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.485697365907981e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6314.2734375,
+      "completions/mean_terminated_length": 6072.60009765625,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.8780038207769394,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.007643720600754023,
+      "learning_rate": 1e-05,
+      "loss": 0.0873,
+      "num_tokens": 314180717.0,
+      "reward": 0.4609375,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999802112579346,
+      "sampling/importance_sampling_ratio/min": 0.021285315975546837,
+      "sampling/sampling_logp_difference/max": 3.8497378826141357,
+      "sampling/sampling_logp_difference/mean": 0.01964358240365982,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 3.065382111344661e-05,
+      "clip_ratio/high_mean": 9.187473835936544e-06,
+      "clip_ratio/low_mean": 4.137891801292426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.056639065514901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6718.2265625,
+      "completions/mean_terminated_length": 6486.24853515625,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.8326799497008324,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050973957404494286,
+      "learning_rate": 1e-05,
+      "loss": 0.0109,
+      "num_tokens": 315060842.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3521803915500641,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000014066696167,
+      "sampling/importance_sampling_ratio/min": 0.0009130688849836588,
+      "sampling/sampling_logp_difference/max": 6.998699188232422,
+      "sampling/sampling_logp_difference/mean": 0.019501537084579468,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 8.624853762739804e-06,
+      "clip_ratio/high_mean": 2.156213440684951e-06,
+      "clip_ratio/low_mean": 1.8797969062234188e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0954182048171788e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16128.0,
+      "completions/mean_length": 8666.8359375,
+      "completions/mean_terminated_length": 7941.291015625,
+      "completions/min_length": 565.0,
+      "completions/min_terminated_length": 565.0,
+      "entropy": 0.9526705741882324,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019092690199613571,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 316190325.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999814629554749,
+      "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05,
+      "sampling/sampling_logp_difference/max": 10.249995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02051631174981594,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 2.147400391550036e-05,
+      "clip_ratio/high_mean": 6.434908300434472e-06,
+      "clip_ratio/low_mean": 3.521234066283796e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.164724816746457e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15164.0,
+      "completions/mean_length": 7661.8203125,
+      "completions/mean_terminated_length": 7002.16015625,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 0.8322782590985298,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019530428107827902,
+      "learning_rate": 1e-05,
+      "loss": 0.0729,
+      "num_tokens": 317191878.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21382391452789307,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 8.546619210392237e-05,
+      "sampling/sampling_logp_difference/max": 9.367389678955078,
+      "sampling/sampling_logp_difference/mean": 0.019894573837518692,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.9436202364886412e-05,
+      "clip_ratio/high_mean": 6.089704697842535e-06,
+      "clip_ratio/low_mean": 4.2698405422925134e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.878810955233348e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15934.0,
+      "completions/mean_length": 7024.859375,
+      "completions/mean_terminated_length": 6800.240234375,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.794853538274765,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031784537713974714,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 318109004.0,
+      "reward": 0.4921875,
+      "reward_std": 0.31800347566604614,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999352693557739,
+      "sampling/importance_sampling_ratio/min": 0.0002962362195830792,
+      "sampling/sampling_logp_difference/max": 8.124353408813477,
+      "sampling/sampling_logp_difference/mean": 0.018519200384616852,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 4.127455667912727e-06,
+      "clip_ratio/high_mean": 1.0318639169781818e-06,
+      "clip_ratio/low_mean": 4.342453667049995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.445640047379129e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 7282.1796875,
+      "completions/mean_terminated_length": 6912.1865234375,
+      "completions/min_length": 870.0,
+      "completions/min_terminated_length": 870.0,
+      "entropy": 0.904067650437355,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005080109462141991,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 319059075.0,
+      "reward": 0.4140625,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000062108039856,
+      "sampling/importance_sampling_ratio/min": 0.1194523349404335,
+      "sampling/sampling_logp_difference/max": 6.136754989624023,
+      "sampling/sampling_logp_difference/mean": 0.019978653639554977,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.608940076243016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.608940076243016e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15625.0,
+      "completions/mean_length": 7131.5234375,
+      "completions/mean_terminated_length": 6596.255859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.8849587142467499,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022667953744530678,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 319990046.0,
+      "reward": 0.46875,
+      "reward_std": 0.30221715569496155,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0370909757912159,
+      "sampling/sampling_logp_difference/max": 3.294381618499756,
+      "sampling/sampling_logp_difference/mean": 0.02037571743130684,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 1.5356635913121863e-05,
+      "clip_ratio/high_mean": 3.839158978280466e-06,
+      "clip_ratio/low_mean": 3.4950805911648786e-05,
+      "clip_ratio/low_min": 4.876336333836662e-06,
+      "clip_ratio/region_mean": 3.8789965287833184e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16205.0,
+      "completions/mean_length": 6655.4453125,
+      "completions/mean_terminated_length": 6578.84228515625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.7417122721672058,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00216497085057199,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 320860135.0,
+      "reward": 0.5625,
+      "reward_std": 0.3369230031967163,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 0.0005190494703128934,
+      "sampling/sampling_logp_difference/max": 7.563511371612549,
+      "sampling/sampling_logp_difference/mean": 0.01771342009305954,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 1.7605634639039636e-05,
+      "clip_ratio/high_mean": 5.297029474604642e-06,
+      "clip_ratio/low_mean": 5.688933060810086e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.218636053745286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15849.0,
+      "completions/mean_length": 7077.1640625,
+      "completions/mean_terminated_length": 6619.45068359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 0.8749325424432755,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0028338562697172165,
+      "learning_rate": 1e-05,
+      "loss": 0.0643,
+      "num_tokens": 321783852.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998220205307007,
+      "sampling/importance_sampling_ratio/min": 7.83290306571871e-06,
+      "sampling/sampling_logp_difference/max": 11.757177352905273,
+      "sampling/sampling_logp_difference/mean": 0.020299233496189117,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 7.301828190975357e-06,
+      "clip_ratio/high_mean": 1.8254570477438392e-06,
+      "clip_ratio/low_mean": 5.158197632226802e-05,
+      "clip_ratio/low_min": 3.735804057214409e-06,
+      "clip_ratio/region_mean": 5.340743223314348e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15329.0,
+      "completions/mean_length": 6034.296875,
+      "completions/mean_terminated_length": 5525.294921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.80014718323946,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022897711023688316,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 322572882.0,
+      "reward": 0.40625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999347925186157,
+      "sampling/importance_sampling_ratio/min": 0.0004105660773348063,
+      "sampling/sampling_logp_difference/max": 7.7979736328125,
+      "sampling/sampling_logp_difference/mean": 0.01858348958194256,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 9.364057859784225e-06,
+      "clip_ratio/high_mean": 3.351393047523743e-06,
+      "clip_ratio/low_mean": 4.186752630630508e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5218919240141986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 8172.109375,
+      "completions/mean_terminated_length": 7838.29248046875,
+      "completions/min_length": 733.0,
+      "completions/min_terminated_length": 733.0,
+      "entropy": 0.8732693120837212,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003263789461925626,
+      "learning_rate": 1e-05,
+      "loss": 0.0356,
+      "num_tokens": 323640904.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3237774670124054,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999354481697083,
+      "sampling/importance_sampling_ratio/min": 9.27252222027164e-06,
+      "sampling/sampling_logp_difference/max": 11.588455200195312,
+      "sampling/sampling_logp_difference/mean": 0.0208889190107584,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 2.0998899799451465e-05,
+      "clip_ratio/high_mean": 6.692962131182867e-06,
+      "clip_ratio/low_mean": 4.261424010110204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.930720297124935e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 7699.203125,
+      "completions/mean_terminated_length": 7419.04833984375,
+      "completions/min_length": 1225.0,
+      "completions/min_terminated_length": 1225.0,
+      "entropy": 0.8296505436301231,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0042716520838439465,
+      "learning_rate": 1e-05,
+      "loss": 0.0937,
+      "num_tokens": 324643858.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874234199524,
+      "sampling/importance_sampling_ratio/min": 0.00022192654432728887,
+      "sampling/sampling_logp_difference/max": 8.413164138793945,
+      "sampling/sampling_logp_difference/mean": 0.018926654011011124,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 7.061349151626928e-06,
+      "clip_ratio/high_mean": 1.765337287906732e-06,
+      "clip_ratio/low_mean": 4.5005243464402156e-05,
+      "clip_ratio/low_min": 3.861838649754645e-06,
+      "clip_ratio/region_mean": 4.6770580411248375e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16364.0,
+      "completions/max_terminated_length": 16364.0,
+      "completions/mean_length": 7450.1640625,
+      "completions/mean_terminated_length": 7450.1640625,
+      "completions/min_length": 910.0,
+      "completions/min_terminated_length": 910.0,
+      "entropy": 1.0400195196270943,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033558050636202097,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 325617687.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999459385871887,
+      "sampling/importance_sampling_ratio/min": 0.039920732378959656,
+      "sampling/sampling_logp_difference/max": 3.2208595275878906,
+      "sampling/sampling_logp_difference/mean": 0.02249298244714737,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 1.3147802746971138e-05,
+      "clip_ratio/high_mean": 3.2869506867427845e-06,
+      "clip_ratio/low_mean": 2.4451034505545977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7737984851228248e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15342.0,
+      "completions/mean_length": 6799.0703125,
+      "completions/mean_terminated_length": 6723.5986328125,
+      "completions/min_length": 1708.0,
+      "completions/min_terminated_length": 1708.0,
+      "entropy": 0.9737623482942581,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005797459278255701,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 326508384.0,
+      "reward": 0.3125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999321699142456,
+      "sampling/importance_sampling_ratio/min": 7.535634836131067e-07,
+      "sampling/sampling_logp_difference/max": 14.0984525680542,
+      "sampling/sampling_logp_difference/mean": 0.021543748676776886,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 3.3594023989280686e-06,
+      "clip_ratio/high_mean": 8.398505997320171e-07,
+      "clip_ratio/low_mean": 2.3457610382138228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4297460981870245e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 7034.3671875,
+      "completions/mean_terminated_length": 6654.30078125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8749603256583214,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002258980879560113,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 327426407.0,
+      "reward": 0.4609375,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999661445617676,
+      "sampling/importance_sampling_ratio/min": 0.008719252422451973,
+      "sampling/sampling_logp_difference/max": 4.742221832275391,
+      "sampling/sampling_logp_difference/mean": 0.01997346058487892,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 2.823375348270929e-05,
+      "clip_ratio/high_mean": 7.058438370677322e-06,
+      "clip_ratio/low_mean": 4.9395109726901865e-05,
+      "clip_ratio/low_min": 1.636556044104509e-05,
+      "clip_ratio/region_mean": 5.6453548268109444e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15240.0,
+      "completions/mean_length": 6623.078125,
+      "completions/mean_terminated_length": 6388.81640625,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.858784057199955,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002420129720121622,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 328292985.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537417411804,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998596906661987,
+      "sampling/importance_sampling_ratio/min": 0.00014900295354891568,
+      "sampling/sampling_logp_difference/max": 8.811544418334961,
+      "sampling/sampling_logp_difference/mean": 0.019645996391773224,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 1.8078507309837732e-05,
+      "clip_ratio/high_mean": 6.468551191574079e-06,
+      "clip_ratio/low_mean": 4.051302585139638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.698157727034413e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15229.0,
+      "completions/mean_length": 5902.4765625,
+      "completions/mean_terminated_length": 5564.36279296875,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.904740035533905,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004107976797968149,
+      "learning_rate": 1e-05,
+      "loss": 0.0824,
+      "num_tokens": 329067006.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3945493996143341,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999526143074036,
+      "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05,
+      "sampling/sampling_logp_difference/max": 11.37439250946045,
+      "sampling/sampling_logp_difference/mean": 0.019582755863666534,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 2.553658168835682e-05,
+      "clip_ratio/high_mean": 7.276365181496658e-06,
+      "clip_ratio/low_mean": 1.7552573126522475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.482893796695862e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6425.6015625,
+      "completions/mean_terminated_length": 6267.5322265625,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.964553713798523,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003208522219210863,
+      "learning_rate": 1e-05,
+      "loss": 0.0164,
+      "num_tokens": 329910691.0,
+      "reward": 0.359375,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999419450759888,
+      "sampling/importance_sampling_ratio/min": 0.00137569778598845,
+      "sampling/sampling_logp_difference/max": 6.588794231414795,
+      "sampling/sampling_logp_difference/mean": 0.021154657006263733,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 6.8712420215888415e-06,
+      "clip_ratio/high_mean": 1.7178105053972104e-06,
+      "clip_ratio/low_mean": 4.0991827404468495e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2709637853022286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 8006.4453125,
+      "completions/mean_terminated_length": 7594.43408203125,
+      "completions/min_length": 1235.0,
+      "completions/min_terminated_length": 1235.0,
+      "entropy": 0.8980336412787437,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002898421371355653,
+      "learning_rate": 1e-05,
+      "loss": 0.0815,
+      "num_tokens": 330956332.0,
+      "reward": 0.4296875,
+      "reward_std": 0.20175684988498688,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 9.378339746035635e-05,
+      "sampling/sampling_logp_difference/max": 9.27452278137207,
+      "sampling/sampling_logp_difference/mean": 0.021021340042352676,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.2689344689297286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2689344689297286e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15484.0,
+      "completions/max_terminated_length": 15484.0,
+      "completions/mean_length": 7068.828125,
+      "completions/mean_terminated_length": 7068.828125,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.9865007549524307,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0037063576746731997,
+      "learning_rate": 1e-05,
+      "loss": 0.0313,
+      "num_tokens": 331880918.0,
+      "reward": 0.3203125,
+      "reward_std": 0.17859892547130585,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0001819290773710236,
+      "sampling/sampling_logp_difference/max": 8.611893653869629,
+      "sampling/sampling_logp_difference/mean": 0.02072504535317421,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 5.845633268108941e-06,
+      "clip_ratio/high_mean": 1.4614083170272352e-06,
+      "clip_ratio/low_mean": 3.207486906831036e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353627721480734e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 7379.390625,
+      "completions/mean_terminated_length": 7236.4609375,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.8977236375212669,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001972826896235347,
+      "learning_rate": 1e-05,
+      "loss": 0.0228,
+      "num_tokens": 332849112.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 2.820451663865242e-05,
+      "sampling/sampling_logp_difference/max": 10.476028442382812,
+      "sampling/sampling_logp_difference/mean": 0.019411223009228706,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 4.875385002378607e-06,
+      "clip_ratio/high_mean": 1.2188462505946518e-06,
+      "clip_ratio/low_mean": 2.3530714997832547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.47495612484272e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15517.0,
+      "completions/mean_length": 6867.9609375,
+      "completions/mean_terminated_length": 6793.03125,
+      "completions/min_length": 760.0,
+      "completions/min_terminated_length": 760.0,
+      "entropy": 0.9244343340396881,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.006926023401319981,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 333746179.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1433562934398651,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.0003875594411510974,
+      "sampling/sampling_logp_difference/max": 7.8556413650512695,
+      "sampling/sampling_logp_difference/mean": 0.020311862230300903,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 1.5651628245905158e-05,
+      "clip_ratio/high_mean": 4.836261211949022e-06,
+      "clip_ratio/low_mean": 5.268017821435933e-05,
+      "clip_ratio/low_min": 3.950945028918795e-06,
+      "clip_ratio/region_mean": 5.751643902840442e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 7525.375,
+      "completions/mean_terminated_length": 6855.3955078125,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9207312315702438,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0047226278111338615,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 334731027.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3353874683380127,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999615550041199,
+      "sampling/importance_sampling_ratio/min": 0.00029753465787507594,
+      "sampling/sampling_logp_difference/max": 8.119979858398438,
+      "sampling/sampling_logp_difference/mean": 0.021496692672371864,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 3.815379886873416e-05,
+      "clip_ratio/high_mean": 9.53844971718354e-06,
+      "clip_ratio/low_mean": 4.519663821156428e-05,
+      "clip_ratio/low_min": 2.775434040813707e-06,
+      "clip_ratio/region_mean": 5.473508826980833e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16251.0,
+      "completions/mean_length": 6841.0625,
+      "completions/mean_terminated_length": 6453.13818359375,
+      "completions/min_length": 689.0,
+      "completions/min_terminated_length": 689.0,
+      "entropy": 0.8979457840323448,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004971448332071304,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 335631243.0,
+      "reward": 0.390625,
+      "reward_std": 0.2596156895160675,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999934196472168,
+      "sampling/importance_sampling_ratio/min": 9.655764188210014e-06,
+      "sampling/sampling_logp_difference/max": 11.547955513000488,
+      "sampling/sampling_logp_difference/mean": 0.020256079733371735,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 4.162365712545579e-06,
+      "clip_ratio/high_mean": 1.0405914281363948e-06,
+      "clip_ratio/low_mean": 3.1563491688757495e-05,
+      "clip_ratio/low_min": 3.1228139505401487e-06,
+      "clip_ratio/region_mean": 3.260408311689389e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15060.0,
+      "completions/mean_length": 6919.8046875,
+      "completions/mean_terminated_length": 6454.35205078125,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9241961911320686,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038604787550866604,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 336537162.0,
+      "reward": 0.375,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998080730438232,
+      "sampling/importance_sampling_ratio/min": 0.0009118975722230971,
+      "sampling/sampling_logp_difference/max": 6.999982833862305,
+      "sampling/sampling_logp_difference/mean": 0.02030865103006363,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 6.5182248363271356e-06,
+      "clip_ratio/high_mean": 1.6295562090817839e-06,
+      "clip_ratio/low_mean": 4.3847362121596234e-05,
+      "clip_ratio/low_min": 6.294533704931382e-06,
+      "clip_ratio/region_mean": 4.547691833067802e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15692.0,
+      "completions/mean_length": 7679.390625,
+      "completions/mean_terminated_length": 7099.08349609375,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "entropy": 1.0165777206420898,
+      "epoch": 0.35418583256669733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004624314606189728,
+      "learning_rate": 1e-05,
+      "loss": 0.0849,
+      "num_tokens": 337542492.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999251961708069,
+      "sampling/importance_sampling_ratio/min": 5.83546279813163e-05,
+      "sampling/sampling_logp_difference/max": 9.748971939086914,
+      "sampling/sampling_logp_difference/mean": 0.02206476218998432,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 6.00499606662197e-06,
+      "clip_ratio/high_mean": 1.5012490166554926e-06,
+      "clip_ratio/low_mean": 3.392923713363416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.543048615028965e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 5957.5859375,
+      "completions/mean_terminated_length": 5792.08740234375,
+      "completions/min_length": 1705.0,
+      "completions/min_terminated_length": 1705.0,
+      "entropy": 0.7705951780080795,
+      "epoch": 0.35510579576816925,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021966886706650257,
+      "learning_rate": 1e-05,
+      "loss": 0.0789,
+      "num_tokens": 338324279.0,
+      "reward": 0.53125,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999998927116394,
+      "sampling/importance_sampling_ratio/min": 0.0008041196851991117,
+      "sampling/sampling_logp_difference/max": 7.125762462615967,
+      "sampling/sampling_logp_difference/mean": 0.01804077997803688,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 1.5711350215497077e-05,
+      "clip_ratio/high_mean": 3.927837553874269e-06,
+      "clip_ratio/low_mean": 5.276240381135722e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.669024130838807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7269.8046875,
+      "completions/mean_terminated_length": 7198.03955078125,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 1.0025205165147781,
+      "epoch": 0.3560257589696412,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001694107661023736,
+      "learning_rate": 1e-05,
+      "loss": 0.134,
+      "num_tokens": 339274662.0,
+      "reward": 0.3359375,
+      "reward_std": 0.30487072467803955,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039769172668,
+      "sampling/importance_sampling_ratio/min": 0.0015677008777856827,
+      "sampling/sampling_logp_difference/max": 6.4581451416015625,
+      "sampling/sampling_logp_difference/mean": 0.021742526441812515,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 7.005848829066963e-06,
+      "clip_ratio/high_mean": 1.7514622072667407e-06,
+      "clip_ratio/low_mean": 5.100632029098051e-05,
+      "clip_ratio/low_min": 8.934973720897688e-06,
+      "clip_ratio/region_mean": 5.275778244140383e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7643.8359375,
+      "completions/mean_terminated_length": 7288.54443359375,
+      "completions/min_length": 1061.0,
+      "completions/min_terminated_length": 1061.0,
+      "entropy": 0.7936615869402885,
+      "epoch": 0.35694572217111314,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004587972536683083,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 340272689.0,
+      "reward": 0.5078125,
+      "reward_std": 0.35324612259864807,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999613761901855,
+      "sampling/importance_sampling_ratio/min": 0.0007390327518805861,
+      "sampling/sampling_logp_difference/max": 7.210168361663818,
+      "sampling/sampling_logp_difference/mean": 0.01862112432718277,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 1.0522736374696251e-05,
+      "clip_ratio/high_mean": 2.6306840936740628e-06,
+      "clip_ratio/low_mean": 2.139122614153166e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4021910121518886e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14401.0,
+      "completions/mean_length": 7068.734375,
+      "completions/mean_terminated_length": 6610.60595703125,
+      "completions/min_length": 775.0,
+      "completions/min_terminated_length": 775.0,
+      "entropy": 0.8858344480395317,
+      "epoch": 0.3578656853725851,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00245783943682909,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 341195599.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21594557166099548,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957263469696,
+      "sampling/importance_sampling_ratio/min": 1.526316918898374e-05,
+      "sampling/sampling_logp_difference/max": 11.090067863464355,
+      "sampling/sampling_logp_difference/mean": 0.019989900290966034,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 5.272259386401856e-06,
+      "clip_ratio/high_mean": 1.318064846600464e-06,
+      "clip_ratio/low_mean": 2.2939096254503966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4257160987417592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15788.0,
+      "completions/mean_length": 6093.296875,
+      "completions/mean_terminated_length": 5929.95263671875,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.9640207663178444,
+      "epoch": 0.35878564857405704,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0067657483741641045,
+      "learning_rate": 1e-05,
+      "loss": 0.0181,
+      "num_tokens": 341993565.0,
+      "reward": 0.4453125,
+      "reward_std": 0.12415502220392227,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998992681503296,
+      "sampling/importance_sampling_ratio/min": 0.010459281504154205,
+      "sampling/sampling_logp_difference/max": 4.56026554107666,
+      "sampling/sampling_logp_difference/mean": 0.02037961222231388,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.566248594528588e-05,
+      "clip_ratio/low_min": 4.402028480399167e-06,
+      "clip_ratio/region_mean": 4.566248594528588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16170.0,
+      "completions/max_terminated_length": 16170.0,
+      "completions/mean_length": 7620.09375,
+      "completions/mean_terminated_length": 7620.09375,
+      "completions/min_length": 1076.0,
+      "completions/min_terminated_length": 1076.0,
+      "entropy": 0.9773544892668724,
+      "epoch": 0.35970561177552896,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018817185191437602,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 342990545.0,
+      "reward": 0.3046875,
+      "reward_std": 0.18755048513412476,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0006883936002850533,
+      "sampling/sampling_logp_difference/max": 7.281149864196777,
+      "sampling/sampling_logp_difference/mean": 0.021528441458940506,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 2.6727505428425502e-05,
+      "clip_ratio/high_mean": 7.985045499481203e-06,
+      "clip_ratio/low_mean": 7.762144696243922e-05,
+      "clip_ratio/low_min": 2.4772080450929934e-05,
+      "clip_ratio/region_mean": 8.560649303035461e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15053.0,
+      "completions/mean_length": 6963.984375,
+      "completions/mean_terminated_length": 6737.904296875,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 0.9683744385838509,
+      "epoch": 0.36062557497700093,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052104732021689415,
+      "learning_rate": 1e-05,
+      "loss": 0.087,
+      "num_tokens": 343898791.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324679374695,
+      "sampling/importance_sampling_ratio/min": 0.010815954767167568,
+      "sampling/sampling_logp_difference/max": 4.526732921600342,
+      "sampling/sampling_logp_difference/mean": 0.021434593945741653,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 1.3545108686230378e-05,
+      "clip_ratio/high_mean": 4.365133804640209e-06,
+      "clip_ratio/low_mean": 2.5377692509209737e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9742826200163108e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15116.0,
+      "completions/mean_length": 6718.5078125,
+      "completions/mean_terminated_length": 6642.4013671875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9043834507465363,
+      "epoch": 0.36154553817847285,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005151392426341772,
+      "learning_rate": 1e-05,
+      "loss": 0.0085,
+      "num_tokens": 344779672.0,
+      "reward": 0.4921875,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999840497970581,
+      "sampling/importance_sampling_ratio/min": 0.0024171893019229174,
+      "sampling/sampling_logp_difference/max": 6.025149822235107,
+      "sampling/sampling_logp_difference/mean": 0.0201373603194952,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 1.2263486723895767e-05,
+      "clip_ratio/high_mean": 3.927679188109323e-06,
+      "clip_ratio/low_mean": 2.739263118201052e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132031042696326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16342.0,
+      "completions/mean_length": 7044.640625,
+      "completions/mean_terminated_length": 6820.49609375,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "entropy": 0.9017335474491119,
+      "epoch": 0.3624655013799448,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026606651954352856,
+      "learning_rate": 1e-05,
+      "loss": 0.0554,
+      "num_tokens": 345701722.0,
+      "reward": 0.3125,
+      "reward_std": 0.24146249890327454,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05,
+      "sampling/sampling_logp_difference/max": 10.157968521118164,
+      "sampling/sampling_logp_difference/mean": 0.01981864869594574,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 1.026556356009678e-05,
+      "clip_ratio/high_mean": 2.566390890024195e-06,
+      "clip_ratio/low_mean": 4.819571529424138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0762106297952414e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15476.0,
+      "completions/mean_length": 6031.875,
+      "completions/mean_terminated_length": 5950.3623046875,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.8537683561444283,
+      "epoch": 0.36338546458141674,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003957017324864864,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 346492810.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999707341194153,
+      "sampling/importance_sampling_ratio/min": 0.0015133036067709327,
+      "sampling/sampling_logp_difference/max": 6.493460178375244,
+      "sampling/sampling_logp_difference/mean": 0.018711457028985023,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 5.870488848813693e-06,
+      "clip_ratio/high_mean": 1.4676222122034233e-06,
+      "clip_ratio/low_mean": 3.637038832948747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.783801014378696e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15942.0,
+      "completions/mean_length": 7429.3515625,
+      "completions/mean_terminated_length": 6911.31396484375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.8821266070008278,
+      "epoch": 0.36430542778288866,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002122648525983095,
+      "learning_rate": 1e-05,
+      "loss": 0.1257,
+      "num_tokens": 347462871.0,
+      "reward": 0.453125,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000076293945312,
+      "sampling/importance_sampling_ratio/min": 0.00014005196862854064,
+      "sampling/sampling_logp_difference/max": 8.873497009277344,
+      "sampling/sampling_logp_difference/mean": 0.01998838409781456,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 1.0663932243915042e-05,
+      "clip_ratio/high_mean": 2.6659830609787605e-06,
+      "clip_ratio/low_mean": 6.443337406381033e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.709935701110226e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15761.0,
+      "completions/mean_length": 7131.7109375,
+      "completions/mean_terminated_length": 6833.25,
+      "completions/min_length": 821.0,
+      "completions/min_terminated_length": 821.0,
+      "entropy": 0.8575824722647667,
+      "epoch": 0.36522539098436063,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002546454081311822,
+      "learning_rate": 1e-05,
+      "loss": 0.0676,
+      "num_tokens": 348395842.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999964714050293,
+      "sampling/importance_sampling_ratio/min": 0.0002167800412280485,
+      "sampling/sampling_logp_difference/max": 8.436627388000488,
+      "sampling/sampling_logp_difference/mean": 0.0193922221660614,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 3.847337666229578e-06,
+      "clip_ratio/high_mean": 9.618344165573944e-07,
+      "clip_ratio/low_mean": 3.932982110654848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.029165563679271e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16200.0,
+      "completions/mean_length": 6858.34375,
+      "completions/mean_terminated_length": 6707.14306640625,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.9539813920855522,
+      "epoch": 0.36614535418583255,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00492837093770504,
+      "learning_rate": 1e-05,
+      "loss": 0.0818,
+      "num_tokens": 349292790.0,
+      "reward": 0.390625,
+      "reward_std": 0.1949220597743988,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998850226402283,
+      "sampling/importance_sampling_ratio/min": 0.0011153683299198747,
+      "sampling/sampling_logp_difference/max": 6.79857063293457,
+      "sampling/sampling_logp_difference/mean": 0.020318543538451195,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 1.291372609557584e-05,
+      "clip_ratio/high_mean": 3.22843152389396e-06,
+      "clip_ratio/low_mean": 3.8245348378040944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1473780811429606e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15261.0,
+      "completions/mean_length": 7809.984375,
+      "completions/mean_terminated_length": 7533.40283203125,
+      "completions/min_length": 1002.0,
+      "completions/min_terminated_length": 1002.0,
+      "entropy": 0.8353303670883179,
+      "epoch": 0.3670653173873045,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004895905964076519,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 350312556.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22567616403102875,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999260306358337,
+      "sampling/importance_sampling_ratio/min": 0.0008417933131568134,
+      "sampling/sampling_logp_difference/max": 7.0799760818481445,
+      "sampling/sampling_logp_difference/mean": 0.018754083663225174,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 1.1250081115576904e-05,
+      "clip_ratio/high_mean": 3.5690324011738994e-06,
+      "clip_ratio/low_mean": 3.196108968950284e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.553012152224255e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15057.0,
+      "completions/mean_length": 7194.9296875,
+      "completions/mean_terminated_length": 6821.39013671875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9744522422552109,
+      "epoch": 0.36798528058877644,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0032397822942584753,
+      "learning_rate": 1e-05,
+      "loss": 0.0402,
+      "num_tokens": 351252755.0,
+      "reward": 0.421875,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998766183853149,
+      "sampling/importance_sampling_ratio/min": 0.00023159870761446655,
+      "sampling/sampling_logp_difference/max": 8.370504379272461,
+      "sampling/sampling_logp_difference/mean": 0.02105094864964485,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 6.980455509619787e-06,
+      "clip_ratio/high_mean": 1.7451138774049468e-06,
+      "clip_ratio/low_mean": 2.2670621888210007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.441573599298863e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 6836.234375,
+      "completions/mean_terminated_length": 6607.08837890625,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.9149863049387932,
+      "epoch": 0.3689052437902484,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031576494220644236,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 352145873.0,
+      "reward": 0.3671875,
+      "reward_std": 0.22225630283355713,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999266862869263,
+      "sampling/importance_sampling_ratio/min": 0.0011975533561781049,
+      "sampling/sampling_logp_difference/max": 6.727474689483643,
+      "sampling/sampling_logp_difference/mean": 0.020445333793759346,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 2.3557336589874467e-05,
+      "clip_ratio/high_mean": 5.889334147468617e-06,
+      "clip_ratio/low_mean": 5.359988131203863e-05,
+      "clip_ratio/low_min": 1.3856095392839052e-05,
+      "clip_ratio/region_mean": 5.9489215118446737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 6942.65625,
+      "completions/mean_terminated_length": 6638.0966796875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "entropy": 0.7541583999991417,
+      "epoch": 0.36982520699172033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003970830701291561,
+      "learning_rate": 1e-05,
+      "loss": 0.051,
+      "num_tokens": 353056405.0,
+      "reward": 0.453125,
+      "reward_std": 0.3282659649848938,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 8.399576472584158e-06,
+      "sampling/sampling_logp_difference/max": 11.687329292297363,
+      "sampling/sampling_logp_difference/mean": 0.018101349472999573,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 2.6139805413549766e-05,
+      "clip_ratio/high_mean": 7.517377525800839e-06,
+      "clip_ratio/low_mean": 1.968103515537223e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7198412681173068e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14786.0,
+      "completions/max_terminated_length": 14786.0,
+      "completions/mean_length": 6022.1875,
+      "completions/mean_terminated_length": 6022.1875,
+      "completions/min_length": 1285.0,
+      "completions/min_terminated_length": 1285.0,
+      "entropy": 0.9535745903849602,
+      "epoch": 0.37074517019319225,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0043656788766384125,
+      "learning_rate": 1e-05,
+      "loss": 0.029,
+      "num_tokens": 353844661.0,
+      "reward": 0.4140625,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.04981832951307297,
+      "sampling/sampling_logp_difference/max": 2.9993722438812256,
+      "sampling/sampling_logp_difference/mean": 0.020655371248722076,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 9.152076700047473e-06,
+      "clip_ratio/high_mean": 2.9508817647183605e-06,
+      "clip_ratio/low_mean": 5.21388310517068e-05,
+      "clip_ratio/low_min": 2.633131089169183e-06,
+      "clip_ratio/region_mean": 5.508971298695542e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15906.0,
+      "completions/mean_length": 8068.96875,
+      "completions/mean_terminated_length": 7869.408203125,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.9473539590835571,
+      "epoch": 0.3716651333946642,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006543307099491358,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 354894689.0,
+      "reward": 0.2578125,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999514818191528,
+      "sampling/importance_sampling_ratio/min": 6.672408926533535e-05,
+      "sampling/sampling_logp_difference/max": 9.614944458007812,
+      "sampling/sampling_logp_difference/mean": 0.021852033212780952,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 2.9619268843816826e-05,
+      "clip_ratio/high_mean": 7.4048172109542065e-06,
+      "clip_ratio/low_mean": 5.5152235972855124e-05,
+      "clip_ratio/low_min": 1.0455875781190116e-05,
+      "clip_ratio/region_mean": 6.255705375224352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15748.0,
+      "completions/mean_length": 5960.1875,
+      "completions/mean_terminated_length": 5878.1103515625,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "entropy": 0.9564141109585762,
+      "epoch": 0.37258509659613614,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003351036459207535,
+      "learning_rate": 1e-05,
+      "loss": 0.0293,
+      "num_tokens": 355677273.0,
+      "reward": 0.46875,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999220371246338,
+      "sampling/importance_sampling_ratio/min": 0.0012859756825491786,
+      "sampling/sampling_logp_difference/max": 6.656237602233887,
+      "sampling/sampling_logp_difference/mean": 0.021779976785182953,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 7.957685966175632e-06,
+      "clip_ratio/high_mean": 1.989421491543908e-06,
+      "clip_ratio/low_mean": 3.758041248147492e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.956983414354909e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15669.0,
+      "completions/mean_length": 7620.21875,
+      "completions/mean_terminated_length": 7189.212890625,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 1.035948596894741,
+      "epoch": 0.3735050597976081,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031219006050378084,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 356675829.0,
+      "reward": 0.296875,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001060962677002,
+      "sampling/importance_sampling_ratio/min": 0.010141897015273571,
+      "sampling/sampling_logp_difference/max": 4.591080188751221,
+      "sampling/sampling_logp_difference/mean": 0.021951109170913696,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 2.286768199155631e-05,
+      "clip_ratio/high_mean": 5.7169204978890775e-06,
+      "clip_ratio/low_mean": 3.914574369900947e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.486266482217616e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14038.0,
+      "completions/mean_length": 5806.0234375,
+      "completions/mean_terminated_length": 5638.119140625,
+      "completions/min_length": 1319.0,
+      "completions/min_terminated_length": 1319.0,
+      "entropy": 0.8977029845118523,
+      "epoch": 0.37442502299908004,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002810312667861581,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 357438712.0,
+      "reward": 0.546875,
+      "reward_std": 0.22832970321178436,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999280571937561,
+      "sampling/importance_sampling_ratio/min": 0.0011738575994968414,
+      "sampling/sampling_logp_difference/max": 6.747459888458252,
+      "sampling/sampling_logp_difference/mean": 0.01965375244617462,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 1.2219379641464911e-05,
+      "clip_ratio/high_mean": 3.054844910366228e-06,
+      "clip_ratio/low_mean": 3.186109779562685e-05,
+      "clip_ratio/low_min": 4.3511558942554984e-06,
+      "clip_ratio/region_mean": 3.4915943160740426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15705.0,
+      "completions/max_terminated_length": 15705.0,
+      "completions/mean_length": 6537.4609375,
+      "completions/mean_terminated_length": 6537.4609375,
+      "completions/min_length": 842.0,
+      "completions/min_terminated_length": 842.0,
+      "entropy": 0.9577726796269417,
+      "epoch": 0.37534498620055196,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004516562446951866,
+      "learning_rate": 1e-05,
+      "loss": 0.0517,
+      "num_tokens": 358296731.0,
+      "reward": 0.3828125,
+      "reward_std": 0.1830746978521347,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999170303344727,
+      "sampling/importance_sampling_ratio/min": 2.384942035860149e-06,
+      "sampling/sampling_logp_difference/max": 12.946335792541504,
+      "sampling/sampling_logp_difference/mean": 0.021242395043373108,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 1.4422689218918094e-05,
+      "clip_ratio/high_mean": 3.6056723047295236e-06,
+      "clip_ratio/low_mean": 3.026239573955536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3868068385345396e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16360.0,
+      "completions/mean_length": 7896.671875,
+      "completions/mean_terminated_length": 7622.88671875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.9163230583071709,
+      "epoch": 0.37626494940202393,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003542230697348714,
+      "learning_rate": 1e-05,
+      "loss": 0.05,
+      "num_tokens": 359327001.0,
+      "reward": 0.375,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998560547828674,
+      "sampling/importance_sampling_ratio/min": 0.00010891625424847007,
+      "sampling/sampling_logp_difference/max": 9.124931335449219,
+      "sampling/sampling_logp_difference/mean": 0.020085681229829788,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 1.7827243254942005e-05,
+      "clip_ratio/high_mean": 5.474494003010477e-06,
+      "clip_ratio/low_mean": 4.2465159026505717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.793965263161226e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15297.0,
+      "completions/mean_length": 6728.7109375,
+      "completions/mean_terminated_length": 6652.68505859375,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9010183215141296,
+      "epoch": 0.37718491260349585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0035069347359240055,
+      "learning_rate": 1e-05,
+      "loss": 0.0518,
+      "num_tokens": 360208780.0,
+      "reward": 0.5390625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999571442604065,
+      "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05,
+      "sampling/sampling_logp_difference/max": 11.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.021022530272603035,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 1.0376989393989788e-05,
+      "clip_ratio/high_mean": 2.594247348497447e-06,
+      "clip_ratio/low_mean": 2.8587513156708155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1181759936771414e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6800.3984375,
+      "completions/mean_terminated_length": 6491.25,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.8654960840940475,
+      "epoch": 0.3781048758049678,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033910400234162807,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 361098567.0,
+      "reward": 0.5625,
+      "reward_std": 0.2306838035583496,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998576641082764,
+      "sampling/importance_sampling_ratio/min": 0.001449413481168449,
+      "sampling/sampling_logp_difference/max": 6.536596298217773,
+      "sampling/sampling_logp_difference/mean": 0.019660964608192444,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 2.3068858354236e-05,
+      "clip_ratio/high_mean": 7.792090059410839e-06,
+      "clip_ratio/low_mean": 5.8515578757578623e-05,
+      "clip_ratio/low_min": 1.0348648629587842e-05,
+      "clip_ratio/region_mean": 6.630766870330262e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7103.4453125,
+      "completions/mean_terminated_length": 6956.13525390625,
+      "completions/min_length": 1711.0,
+      "completions/min_terminated_length": 1711.0,
+      "entropy": 0.8317076042294502,
+      "epoch": 0.37902483900643974,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036110079381614923,
+      "learning_rate": 1e-05,
+      "loss": 0.0834,
+      "num_tokens": 362027520.0,
+      "reward": 0.546875,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999338984489441,
+      "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05,
+      "sampling/sampling_logp_difference/max": 11.458046913146973,
+      "sampling/sampling_logp_difference/mean": 0.01939362846314907,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 3.112394779236638e-06,
+      "clip_ratio/high_mean": 7.780986948091595e-07,
+      "clip_ratio/low_mean": 5.127149995587388e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.204959859383962e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15830.0,
+      "completions/mean_length": 7344.9296875,
+      "completions/mean_terminated_length": 6900.384765625,
+      "completions/min_length": 1368.0,
+      "completions/min_terminated_length": 1368.0,
+      "entropy": 0.8387318029999733,
+      "epoch": 0.37994480220791166,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002141098491847515,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 362985207.0,
+      "reward": 0.34375,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999322891235352,
+      "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05,
+      "sampling/sampling_logp_difference/max": 10.874617576599121,
+      "sampling/sampling_logp_difference/mean": 0.01929464004933834,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 5.2602786126954015e-06,
+      "clip_ratio/high_mean": 1.3150696531738504e-06,
+      "clip_ratio/low_mean": 1.7854434247510653e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9169503786997666e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16137.0,
+      "completions/mean_length": 6377.7734375,
+      "completions/mean_terminated_length": 6218.94482421875,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9732858911156654,
+      "epoch": 0.38086476540938363,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015244127716869116,
+      "learning_rate": 1e-05,
+      "loss": 0.0608,
+      "num_tokens": 363823914.0,
+      "reward": 0.4375,
+      "reward_std": 0.1988610327243805,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 0.006335465237498283,
+      "sampling/sampling_logp_difference/max": 5.061592102050781,
+      "sampling/sampling_logp_difference/mean": 0.020688029006123543,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 2.6195500595349586e-05,
+      "clip_ratio/high_mean": 6.548875148837396e-06,
+      "clip_ratio/low_mean": 3.3802934012783226e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035180882056011e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14456.0,
+      "completions/mean_length": 5599.7890625,
+      "completions/mean_terminated_length": 5340.96826171875,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8872368410229683,
+      "epoch": 0.38178472861085555,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002647512126713991,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 364561127.0,
+      "reward": 0.453125,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999077916145325,
+      "sampling/importance_sampling_ratio/min": 2.370526999584399e-06,
+      "sampling/sampling_logp_difference/max": 12.952398300170898,
+      "sampling/sampling_logp_difference/mean": 0.01878243312239647,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 2.157278959202813e-05,
+      "clip_ratio/high_mean": 5.3931973980070325e-06,
+      "clip_ratio/low_mean": 7.215861739950924e-05,
+      "clip_ratio/low_min": 1.4898997051204788e-05,
+      "clip_ratio/region_mean": 7.755181559332414e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15905.0,
+      "completions/mean_length": 7877.2890625,
+      "completions/mean_terminated_length": 7385.1650390625,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.8416353687644005,
+      "epoch": 0.3827046918123275,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018051012884825468,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 365590124.0,
+      "reward": 0.3125,
+      "reward_std": 0.28407180309295654,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.0004095165350008756,
+      "sampling/sampling_logp_difference/max": 7.800533294677734,
+      "sampling/sampling_logp_difference/mean": 0.019809434190392494,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 2.540994637456606e-05,
+      "clip_ratio/high_mean": 6.352486593641515e-06,
+      "clip_ratio/low_mean": 4.230594890941575e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8658435844117776e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16083.0,
+      "completions/mean_length": 6836.7890625,
+      "completions/mean_terminated_length": 6200.30859375,
+      "completions/min_length": 909.0,
+      "completions/min_terminated_length": 909.0,
+      "entropy": 0.8647575601935387,
+      "epoch": 0.38362465501379944,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004550795070827007,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 366486337.0,
+      "reward": 0.40625,
+      "reward_std": 0.22620806097984314,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873638153076,
+      "sampling/importance_sampling_ratio/min": 0.0001089095021598041,
+      "sampling/sampling_logp_difference/max": 9.124993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01992485672235489,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 1.1592664577619871e-05,
+      "clip_ratio/high_mean": 2.8981661444049678e-06,
+      "clip_ratio/low_mean": 3.5717548257707676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.861571451579948e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16286.0,
+      "completions/mean_length": 6884.953125,
+      "completions/mean_terminated_length": 6417.78662109375,
+      "completions/min_length": 1289.0,
+      "completions/min_terminated_length": 1289.0,
+      "entropy": 0.8691708743572235,
+      "epoch": 0.3845446182152714,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005958946421742439,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 367386163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000011920928955,
+      "sampling/importance_sampling_ratio/min": 9.519772902422119e-06,
+      "sampling/sampling_logp_difference/max": 11.562139511108398,
+      "sampling/sampling_logp_difference/mean": 0.019436441361904144,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 2.7658640192385064e-05,
+      "clip_ratio/high_mean": 8.455849524580117e-06,
+      "clip_ratio/low_mean": 3.938097847822064e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7836828116487595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15574.0,
+      "completions/mean_length": 7439.1328125,
+      "completions/mean_terminated_length": 7150.58837890625,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 0.795464999973774,
+      "epoch": 0.38546458141674333,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00558120384812355,
+      "learning_rate": 1e-05,
+      "loss": 0.1918,
+      "num_tokens": 368357500.0,
+      "reward": 0.609375,
+      "reward_std": 0.3795146346092224,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.0001159337698481977,
+      "sampling/sampling_logp_difference/max": 9.062491416931152,
+      "sampling/sampling_logp_difference/mean": 0.018824251368641853,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 8.509555527780321e-06,
+      "clip_ratio/high_mean": 2.1273888819450804e-06,
+      "clip_ratio/low_mean": 3.0958593640662e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.308598269313734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16236.0,
+      "completions/mean_length": 6751.53125,
+      "completions/mean_terminated_length": 6520.3525390625,
+      "completions/min_length": 715.0,
+      "completions/min_terminated_length": 715.0,
+      "entropy": 0.9450879693031311,
+      "epoch": 0.38638454461821525,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004628168884664774,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 369242920.0,
+      "reward": 0.359375,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999655485153198,
+      "sampling/importance_sampling_ratio/min": 0.0006074689445085824,
+      "sampling/sampling_logp_difference/max": 7.406209468841553,
+      "sampling/sampling_logp_difference/mean": 0.019376013427972794,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 1.8288420505996328e-05,
+      "clip_ratio/high_mean": 4.572105126499082e-06,
+      "clip_ratio/low_mean": 4.86290555272717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320115997164976e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16164.0,
+      "completions/mean_length": 7023.296875,
+      "completions/mean_terminated_length": 6315.3447265625,
+      "completions/min_length": 1628.0,
+      "completions/min_terminated_length": 1628.0,
+      "entropy": 0.7378111630678177,
+      "epoch": 0.3873045078196872,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00389425759203732,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 370159510.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999127388000488,
+      "sampling/importance_sampling_ratio/min": 0.00014012664905749261,
+      "sampling/sampling_logp_difference/max": 8.872963905334473,
+      "sampling/sampling_logp_difference/mean": 0.016914553940296173,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 2.1269573153404053e-05,
+      "clip_ratio/high_mean": 5.948400371380558e-06,
+      "clip_ratio/low_mean": 2.3538930747690756e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9487331687505502e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16018.0,
+      "completions/max_terminated_length": 16018.0,
+      "completions/mean_length": 7702.3046875,
+      "completions/mean_terminated_length": 7702.3046875,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "entropy": 0.9053447172045708,
+      "epoch": 0.38822447102115915,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004324545152485371,
+      "learning_rate": 1e-05,
+      "loss": 0.0149,
+      "num_tokens": 371162773.0,
+      "reward": 0.2421875,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00001060962677,
+      "sampling/importance_sampling_ratio/min": 2.283278627146501e-05,
+      "sampling/sampling_logp_difference/max": 10.687313079833984,
+      "sampling/sampling_logp_difference/mean": 0.020495830103754997,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 1.0294916819475475e-05,
+      "clip_ratio/high_mean": 2.5737292048688687e-06,
+      "clip_ratio/low_mean": 5.831611520079605e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.088984559937671e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 6904.78125,
+      "completions/mean_terminated_length": 6754.31787109375,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.7991176024079323,
+      "epoch": 0.3891444342226311,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003239463549107313,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "num_tokens": 372067241.0,
+      "reward": 0.328125,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00012340991816017777,
+      "sampling/sampling_logp_difference/max": 8.999999046325684,
+      "sampling/sampling_logp_difference/mean": 0.019042208790779114,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 2.7261318791715894e-05,
+      "clip_ratio/high_mean": 7.926559305815317e-06,
+      "clip_ratio/low_mean": 1.552133551285806e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3447895273420727e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15399.0,
+      "completions/mean_length": 6107.7421875,
+      "completions/mean_terminated_length": 5602.35205078125,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "entropy": 0.9495253190398216,
+      "epoch": 0.39006439742410304,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015464330790564418,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 372866072.0,
+      "reward": 0.421875,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971330165863,
+      "sampling/importance_sampling_ratio/min": 0.00024684349773451686,
+      "sampling/sampling_logp_difference/max": 8.306756019592285,
+      "sampling/sampling_logp_difference/mean": 0.019793221727013588,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 2.457227401464479e-05,
+      "clip_ratio/high_mean": 8.533324717063806e-06,
+      "clip_ratio/low_mean": 3.261690835643094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.115023284612107e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15939.0,
+      "completions/mean_length": 6079.8046875,
+      "completions/mean_terminated_length": 5747.4111328125,
+      "completions/min_length": 1082.0,
+      "completions/min_terminated_length": 1082.0,
+      "entropy": 0.8005363270640373,
+      "epoch": 0.39098436062557496,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024811832699924707,
+      "learning_rate": 1e-05,
+      "loss": 0.1124,
+      "num_tokens": 373663463.0,
+      "reward": 0.625,
+      "reward_std": 0.2630355656147003,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743103981018,
+      "sampling/importance_sampling_ratio/min": 0.00019348970090504736,
+      "sampling/sampling_logp_difference/max": 8.550286293029785,
+      "sampling/sampling_logp_difference/mean": 0.017151469364762306,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 3.3719989005476236e-06,
+      "clip_ratio/high_mean": 8.429997251369059e-07,
+      "clip_ratio/low_mean": 2.132218082806503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2165180553201935e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14925.0,
+      "completions/mean_length": 6453.7890625,
+      "completions/mean_terminated_length": 6375.5986328125,
+      "completions/min_length": 347.0,
+      "completions/min_terminated_length": 347.0,
+      "entropy": 0.9212624430656433,
+      "epoch": 0.39190432382704693,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031475063879042864,
+      "learning_rate": 1e-05,
+      "loss": 0.0959,
+      "num_tokens": 374517492.0,
+      "reward": 0.34375,
+      "reward_std": 0.19910329580307007,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999594688415527,
+      "sampling/importance_sampling_ratio/min": 0.015664709731936455,
+      "sampling/sampling_logp_difference/max": 4.156344890594482,
+      "sampling/sampling_logp_difference/mean": 0.019899867475032806,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 1.907509408738406e-05,
+      "clip_ratio/high_mean": 5.984868664654641e-06,
+      "clip_ratio/low_mean": 3.784128080042137e-05,
+      "clip_ratio/low_min": 3.7751804029539926e-06,
+      "clip_ratio/region_mean": 4.382614952191943e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16159.0,
+      "completions/max_terminated_length": 16159.0,
+      "completions/mean_length": 6126.9921875,
+      "completions/mean_terminated_length": 6126.9921875,
+      "completions/min_length": 1106.0,
+      "completions/min_terminated_length": 1106.0,
+      "entropy": 0.8252849578857422,
+      "epoch": 0.39282428702851885,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004200868774205446,
+      "learning_rate": 1e-05,
+      "loss": 0.0276,
+      "num_tokens": 375320339.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999815225601196,
+      "sampling/importance_sampling_ratio/min": 0.005763276945799589,
+      "sampling/sampling_logp_difference/max": 5.156249046325684,
+      "sampling/sampling_logp_difference/mean": 0.01833093911409378,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 1.8918785372079583e-05,
+      "clip_ratio/high_mean": 5.476571459439583e-06,
+      "clip_ratio/low_mean": 6.169724406390742e-05,
+      "clip_ratio/low_min": 7.494657666029525e-06,
+      "clip_ratio/region_mean": 6.717381506859965e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15411.0,
+      "completions/mean_length": 6739.09375,
+      "completions/mean_terminated_length": 6427.9677734375,
+      "completions/min_length": 1228.0,
+      "completions/min_terminated_length": 1228.0,
+      "entropy": 0.8008574098348618,
+      "epoch": 0.3937442502299908,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003204014617949724,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 376201015.0,
+      "reward": 0.5390625,
+      "reward_std": 0.37086254358291626,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998303651809692,
+      "sampling/importance_sampling_ratio/min": 0.00010144581028725952,
+      "sampling/sampling_logp_difference/max": 9.195985794067383,
+      "sampling/sampling_logp_difference/mean": 0.018961725756525993,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 1.3558789078160771e-05,
+      "clip_ratio/high_mean": 3.389697269540193e-06,
+      "clip_ratio/low_mean": 5.3925050679026754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.731474743697618e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15634.0,
+      "completions/mean_length": 7245.8984375,
+      "completions/mean_terminated_length": 6951.12060546875,
+      "completions/min_length": 1306.0,
+      "completions/min_terminated_length": 1306.0,
+      "entropy": 1.0351596996188164,
+      "epoch": 0.39466421343146274,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0039763906970620155,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 377149650.0,
+      "reward": 0.375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000600814819336,
+      "sampling/importance_sampling_ratio/min": 8.106228051474318e-05,
+      "sampling/sampling_logp_difference/max": 9.420292854309082,
+      "sampling/sampling_logp_difference/mean": 0.020948028191924095,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 1.4580486549675697e-05,
+      "clip_ratio/high_mean": 4.259903903403028e-06,
+      "clip_ratio/low_mean": 4.6149686397711775e-05,
+      "clip_ratio/low_min": 3.006686938533676e-06,
+      "clip_ratio/region_mean": 5.04095905853319e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 6958.625,
+      "completions/mean_terminated_length": 6495.08154296875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.8360240310430527,
+      "epoch": 0.39558417663293466,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0031417158897966146,
+      "learning_rate": 1e-05,
+      "loss": 0.0195,
+      "num_tokens": 378057802.0,
+      "reward": 0.515625,
+      "reward_std": 0.35771697759628296,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999384880065918,
+      "sampling/importance_sampling_ratio/min": 0.00010235882655251771,
+      "sampling/sampling_logp_difference/max": 9.187026023864746,
+      "sampling/sampling_logp_difference/mean": 0.019185224547982216,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 6.681633749394678e-06,
+      "clip_ratio/high_mean": 1.6704084373486694e-06,
+      "clip_ratio/low_mean": 5.096616632727091e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.263657521936693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15410.0,
+      "completions/max_terminated_length": 15410.0,
+      "completions/mean_length": 5696.3984375,
+      "completions/mean_terminated_length": 5696.3984375,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.7887749597430229,
+      "epoch": 0.39650413983440663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004943124484270811,
+      "learning_rate": 1e-05,
+      "loss": 0.096,
+      "num_tokens": 378808021.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999057054519653,
+      "sampling/importance_sampling_ratio/min": 0.0015042300801724195,
+      "sampling/sampling_logp_difference/max": 6.499474048614502,
+      "sampling/sampling_logp_difference/mean": 0.018845941871404648,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 1.7526824194646906e-05,
+      "clip_ratio/high_mean": 5.417880970526312e-06,
+      "clip_ratio/low_mean": 3.513921649300755e-05,
+      "clip_ratio/low_min": 6.075038982089609e-06,
+      "clip_ratio/region_mean": 4.0557096895099676e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14233.0,
+      "completions/mean_length": 6480.8828125,
+      "completions/mean_terminated_length": 6323.69091796875,
+      "completions/min_length": 1013.0,
+      "completions/min_terminated_length": 1013.0,
+      "entropy": 0.8796411231160164,
+      "epoch": 0.39742410303587855,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00595651101320982,
+      "learning_rate": 1e-05,
+      "loss": 0.0546,
+      "num_tokens": 379659710.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 0.0017907419241964817,
+      "sampling/sampling_logp_difference/max": 6.325125217437744,
+      "sampling/sampling_logp_difference/mean": 0.01906527951359749,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4512424602107785e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4512424602107785e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7501.703125,
+      "completions/mean_terminated_length": 6829.93310546875,
+      "completions/min_length": 680.0,
+      "completions/min_terminated_length": 680.0,
+      "entropy": 0.786028303205967,
+      "epoch": 0.3983440662373505,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0024527597706764936,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 380640720.0,
+      "reward": 0.5234375,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999595880508423,
+      "sampling/importance_sampling_ratio/min": 8.851602615322918e-07,
+      "sampling/sampling_logp_difference/max": 13.93749713897705,
+      "sampling/sampling_logp_difference/mean": 0.01873261108994484,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 1.4606259583160863e-05,
+      "clip_ratio/high_mean": 5.505394312876888e-06,
+      "clip_ratio/low_mean": 3.1679782978244475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7185177234277944e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15185.0,
+      "completions/mean_length": 5619.2890625,
+      "completions/mean_terminated_length": 5448.4208984375,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.8098893761634827,
+      "epoch": 0.39926402943882244,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004280989523977041,
+      "learning_rate": 1e-05,
+      "loss": 0.0514,
+      "num_tokens": 381377981.0,
+      "reward": 0.609375,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999443292617798,
+      "sampling/importance_sampling_ratio/min": 0.0010248658945783973,
+      "sampling/sampling_logp_difference/max": 6.883193492889404,
+      "sampling/sampling_logp_difference/mean": 0.017923470586538315,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 1.4808703554081148e-05,
+      "clip_ratio/high_mean": 3.702175888520287e-06,
+      "clip_ratio/low_mean": 2.3637440563106793e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7339616224253405e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5243.8203125,
+      "completions/mean_terminated_length": 5156.1025390625,
+      "completions/min_length": 576.0,
+      "completions/min_terminated_length": 576.0,
+      "entropy": 0.7485036551952362,
+      "epoch": 0.40018399264029436,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004721642471849918,
+      "learning_rate": 1e-05,
+      "loss": 0.0877,
+      "num_tokens": 382070478.0,
+      "reward": 0.6875,
+      "reward_std": 0.26538965106010437,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999414086341858,
+      "sampling/importance_sampling_ratio/min": 0.0011518355458974838,
+      "sampling/sampling_logp_difference/max": 6.7663984298706055,
+      "sampling/sampling_logp_difference/mean": 0.016579966992139816,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 3.1177480195765384e-05,
+      "clip_ratio/high_mean": 1.1174359769938746e-05,
+      "clip_ratio/low_mean": 3.602651599976525e-05,
+      "clip_ratio/low_min": 4.348733455117326e-06,
+      "clip_ratio/region_mean": 4.720087713394605e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15978.0,
+      "completions/mean_length": 7021.1796875,
+      "completions/mean_terminated_length": 6872.56396484375,
+      "completions/min_length": 1371.0,
+      "completions/min_terminated_length": 1371.0,
+      "entropy": 0.8693460151553154,
+      "epoch": 0.40110395584176634,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00329192029312253,
+      "learning_rate": 1e-05,
+      "loss": 0.0342,
+      "num_tokens": 382990245.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999822378158569,
+      "sampling/importance_sampling_ratio/min": 0.0023386883549392223,
+      "sampling/sampling_logp_difference/max": 6.058165073394775,
+      "sampling/sampling_logp_difference/mean": 0.019863136112689972,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 1.1192694955752813e-05,
+      "clip_ratio/high_mean": 2.7981737389382033e-06,
+      "clip_ratio/low_mean": 4.9078003257818636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.1876177280973934e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15344.0,
+      "completions/mean_length": 6917.625,
+      "completions/mean_terminated_length": 6452.0654296875,
+      "completions/min_length": 945.0,
+      "completions/min_terminated_length": 945.0,
+      "entropy": 0.8466897681355476,
+      "epoch": 0.40202391904323825,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0051889242604374886,
+      "learning_rate": 1e-05,
+      "loss": 0.1009,
+      "num_tokens": 383896717.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999983310699463,
+      "sampling/importance_sampling_ratio/min": 0.00015846389578655362,
+      "sampling/sampling_logp_difference/max": 8.749983787536621,
+      "sampling/sampling_logp_difference/mean": 0.019528398290276527,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 2.3224948108691024e-05,
+      "clip_ratio/high_mean": 8.263948757303297e-06,
+      "clip_ratio/low_mean": 3.8556312347282073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.682026019509067e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16175.0,
+      "completions/mean_length": 7487.5078125,
+      "completions/mean_terminated_length": 7346.2939453125,
+      "completions/min_length": 877.0,
+      "completions/min_terminated_length": 877.0,
+      "entropy": 0.9584660083055496,
+      "epoch": 0.4029438822447102,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002855573548004031,
+      "learning_rate": 1e-05,
+      "loss": 0.0087,
+      "num_tokens": 384872622.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2477683424949646,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999386668205261,
+      "sampling/importance_sampling_ratio/min": 0.0038593418430536985,
+      "sampling/sampling_logp_difference/max": 5.557258605957031,
+      "sampling/sampling_logp_difference/mean": 0.0209865253418684,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 6.171620498207631e-06,
+      "clip_ratio/high_mean": 1.5429051245519076e-06,
+      "clip_ratio/low_mean": 2.98128834401723e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.135578845103737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16092.0,
+      "completions/mean_length": 6637.5078125,
+      "completions/mean_terminated_length": 6323.1044921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 0.8841215297579765,
+      "epoch": 0.40386384544618215,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004437311552464962,
+      "learning_rate": 1e-05,
+      "loss": 0.0523,
+      "num_tokens": 385744023.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999136924743652,
+      "sampling/importance_sampling_ratio/min": 0.002925124252215028,
+      "sampling/sampling_logp_difference/max": 5.834418296813965,
+      "sampling/sampling_logp_difference/mean": 0.019490888342261314,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 1.3304874300956726e-05,
+      "clip_ratio/high_mean": 3.3262185752391815e-06,
+      "clip_ratio/low_mean": 5.443932013804442e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.776553894065728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15143.0,
+      "completions/mean_length": 5965.9765625,
+      "completions/mean_terminated_length": 5800.611328125,
+      "completions/min_length": 621.0,
+      "completions/min_terminated_length": 621.0,
+      "entropy": 0.8726934269070625,
+      "epoch": 0.4047838086476541,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002463799435645342,
+      "learning_rate": 1e-05,
+      "loss": -0.0075,
+      "num_tokens": 386525492.0,
+      "reward": 0.3984375,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.00020367901015561074,
+      "sampling/sampling_logp_difference/max": 8.4989652633667,
+      "sampling/sampling_logp_difference/mean": 0.01946769654750824,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 1.0084711902891286e-05,
+      "clip_ratio/high_mean": 3.6154040117253317e-06,
+      "clip_ratio/low_mean": 3.598771945689805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9603123695997056e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6693.109375,
+      "completions/mean_terminated_length": 6616.80322265625,
+      "completions/min_length": 1704.0,
+      "completions/min_terminated_length": 1704.0,
+      "entropy": 0.9430640190839767,
+      "epoch": 0.40570377184912604,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038990566972643137,
+      "learning_rate": 1e-05,
+      "loss": 0.0415,
+      "num_tokens": 387404842.0,
+      "reward": 0.421875,
+      "reward_std": 0.31587693095207214,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999700784683228,
+      "sampling/importance_sampling_ratio/min": 0.0011708902893587947,
+      "sampling/sampling_logp_difference/max": 6.749990940093994,
+      "sampling/sampling_logp_difference/mean": 0.020848294720053673,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 7.462686426151777e-06,
+      "clip_ratio/high_mean": 1.8656716065379442e-06,
+      "clip_ratio/low_mean": 5.234285907818048e-05,
+      "clip_ratio/low_min": 4.47803950009984e-06,
+      "clip_ratio/region_mean": 5.420853057103159e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7045.6953125,
+      "completions/mean_terminated_length": 6505.46240234375,
+      "completions/min_length": 926.0,
+      "completions/min_terminated_length": 926.0,
+      "entropy": 0.8912066072225571,
+      "epoch": 0.40662373505059796,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018510994268581271,
+      "learning_rate": 1e-05,
+      "loss": 0.099,
+      "num_tokens": 388324475.0,
+      "reward": 0.40625,
+      "reward_std": 0.32195523381233215,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999024868011475,
+      "sampling/importance_sampling_ratio/min": 0.0031757301185280085,
+      "sampling/sampling_logp_difference/max": 5.752217769622803,
+      "sampling/sampling_logp_difference/mean": 0.020547039806842804,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 2.504527083146968e-05,
+      "clip_ratio/high_mean": 6.26131770786742e-06,
+      "clip_ratio/low_mean": 6.165269871871715e-05,
+      "clip_ratio/low_min": 3.5272871627967106e-06,
+      "clip_ratio/region_mean": 6.791401551708987e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15734.0,
+      "completions/mean_length": 7480.0078125,
+      "completions/mean_terminated_length": 7266.3125,
+      "completions/min_length": 1130.0,
+      "completions/min_terminated_length": 1130.0,
+      "entropy": 0.8813760280609131,
+      "epoch": 0.40754369825206993,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004439481534063816,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 389305644.0,
+      "reward": 0.34375,
+      "reward_std": 0.31300368905067444,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999762773513794,
+      "sampling/importance_sampling_ratio/min": 0.007449973840266466,
+      "sampling/sampling_logp_difference/max": 4.899544715881348,
+      "sampling/sampling_logp_difference/mean": 0.01973455585539341,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 4.0980917219712865e-06,
+      "clip_ratio/high_mean": 1.0245229304928216e-06,
+      "clip_ratio/low_mean": 3.662567087303614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.76501939172158e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15302.0,
+      "completions/max_terminated_length": 15302.0,
+      "completions/mean_length": 7044.4453125,
+      "completions/mean_terminated_length": 7044.4453125,
+      "completions/min_length": 1229.0,
+      "completions/min_terminated_length": 1229.0,
+      "entropy": 0.9901906549930573,
+      "epoch": 0.40846366145354185,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.004181519150733948,
+      "learning_rate": 1e-05,
+      "loss": -0.0068,
+      "num_tokens": 390229373.0,
+      "reward": 0.421875,
+      "reward_std": 0.17700131237506866,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000314712524414,
+      "sampling/importance_sampling_ratio/min": 0.00022536676260642707,
+      "sampling/sampling_logp_difference/max": 8.397781372070312,
+      "sampling/sampling_logp_difference/mean": 0.021211043000221252,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 1.4909872106727562e-05,
+      "clip_ratio/high_mean": 3.7274680266818905e-06,
+      "clip_ratio/low_mean": 5.29995777469594e-05,
+      "clip_ratio/low_min": 3.708758640641463e-06,
+      "clip_ratio/region_mean": 5.672704537573736e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7815.8125,
+      "completions/mean_terminated_length": 7244.6005859375,
+      "completions/min_length": 1350.0,
+      "completions/min_terminated_length": 1350.0,
+      "entropy": 0.8278292864561081,
+      "epoch": 0.4093836246550138,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002691390924155712,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 391251141.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31222954392433167,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 0.007715471088886261,
+      "sampling/sampling_logp_difference/max": 4.864527702331543,
+      "sampling/sampling_logp_difference/mean": 0.018415704369544983,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 2.1858722902834415e-05,
+      "clip_ratio/high_mean": 6.629899417021079e-06,
+      "clip_ratio/low_mean": 3.196247394043894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.859237290271267e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15202.0,
+      "completions/mean_length": 5305.1796875,
+      "completions/mean_terminated_length": 5217.94482421875,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.8100772425532341,
+      "epoch": 0.41030358785648574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0069543467834591866,
+      "learning_rate": 1e-05,
+      "loss": 0.1153,
+      "num_tokens": 391956196.0,
+      "reward": 0.609375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000190734863281,
+      "sampling/importance_sampling_ratio/min": 0.0024869756307452917,
+      "sampling/sampling_logp_difference/max": 5.996687889099121,
+      "sampling/sampling_logp_difference/mean": 0.017318082973361015,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 2.461934036546154e-05,
+      "clip_ratio/high_mean": 8.056288947955181e-06,
+      "clip_ratio/low_mean": 5.289376917971822e-05,
+      "clip_ratio/low_min": 4.21926688431995e-06,
+      "clip_ratio/region_mean": 6.0950058468733914e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15300.0,
+      "completions/mean_length": 7299.578125,
+      "completions/mean_terminated_length": 6930.29248046875,
+      "completions/min_length": 1008.0,
+      "completions/min_terminated_length": 1008.0,
+      "entropy": 0.9955824315547943,
+      "epoch": 0.41122355105795766,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0065611582249403,
+      "learning_rate": 1e-05,
+      "loss": 0.0883,
+      "num_tokens": 392908430.0,
+      "reward": 0.4375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999696016311646,
+      "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06,
+      "sampling/sampling_logp_difference/max": 11.873339653015137,
+      "sampling/sampling_logp_difference/mean": 0.02127375639975071,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 2.4339562514796853e-05,
+      "clip_ratio/high_mean": 7.412756531266496e-06,
+      "clip_ratio/low_mean": 3.89272447591793e-05,
+      "clip_ratio/low_min": 4.047796210215893e-06,
+      "clip_ratio/region_mean": 4.6340001517819474e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 6702.9375,
+      "completions/mean_terminated_length": 6390.64501953125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.82919991761446,
+      "epoch": 0.41214351425942963,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032975098583847284,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 393788286.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999115467071533,
+      "sampling/importance_sampling_ratio/min": 0.00028582560480572283,
+      "sampling/sampling_logp_difference/max": 8.160128593444824,
+      "sampling/sampling_logp_difference/mean": 0.019461583346128464,
+      "step": 448
+    },
+    {
+      "clip_ratio/high_max": 2.3807599063729867e-05,
+      "clip_ratio/high_mean": 5.951899765932467e-06,
+      "clip_ratio/low_mean": 3.195798365140945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.790988330365508e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15244.0,
+      "completions/mean_length": 6468.9453125,
+      "completions/mean_terminated_length": 5536.7607421875,
+      "completions/min_length": 808.0,
+      "completions/min_terminated_length": 808.0,
+      "entropy": 0.6471721827983856,
+      "epoch": 0.41306347746090155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032787907402962446,
+      "learning_rate": 1e-05,
+      "loss": 0.1149,
+      "num_tokens": 394638159.0,
+      "reward": 0.625,
+      "reward_std": 0.25354722142219543,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 0.00012341380352154374,
+      "sampling/sampling_logp_difference/max": 8.999967575073242,
+      "sampling/sampling_logp_difference/mean": 0.016151495277881622,
+      "step": 449
+    },
+    {
+      "clip_ratio/high_max": 2.247072688987828e-05,
+      "clip_ratio/high_mean": 5.61768172246957e-06,
+      "clip_ratio/low_mean": 6.035319393049576e-05,
+      "clip_ratio/low_min": 4.063190772285452e-06,
+      "clip_ratio/region_mean": 6.597087667614687e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15931.0,
+      "completions/mean_length": 6547.3203125,
+      "completions/mean_terminated_length": 6230.0078125,
+      "completions/min_length": 587.0,
+      "completions/min_terminated_length": 587.0,
+      "entropy": 0.9123960956931114,
+      "epoch": 0.4139834406623735,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038375966250896454,
+      "learning_rate": 1e-05,
+      "loss": 0.0967,
+      "num_tokens": 395493872.0,
+      "reward": 0.4296875,
+      "reward_std": 0.30798619985580444,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.00016009423416107893,
+      "sampling/sampling_logp_difference/max": 8.739748001098633,
+      "sampling/sampling_logp_difference/mean": 0.019957344979047775,
+      "step": 450
+    },
+    {
+      "clip_ratio/high_max": 1.404482372890925e-05,
+      "clip_ratio/high_mean": 3.5112059322273126e-06,
+      "clip_ratio/low_mean": 2.315102483407827e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6662230766305584e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15058.0,
+      "completions/mean_length": 6291.859375,
+      "completions/mean_terminated_length": 6131.6669921875,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 0.9841655194759369,
+      "epoch": 0.41490340386384544,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003903903067111969,
+      "learning_rate": 1e-05,
+      "loss": 0.0656,
+      "num_tokens": 396320254.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2569621503353119,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 6.564632712979801e-06,
+      "sampling/sampling_logp_difference/max": 11.93381404876709,
+      "sampling/sampling_logp_difference/mean": 0.020753150805830956,
+      "step": 451
+    },
+    {
+      "clip_ratio/high_max": 1.5189204987109406e-05,
+      "clip_ratio/high_mean": 4.615214265868417e-06,
+      "clip_ratio/low_mean": 3.547988831087423e-05,
+      "clip_ratio/low_min": 3.3967392027989263e-06,
+      "clip_ratio/region_mean": 4.009510257674265e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15966.0,
+      "completions/mean_length": 7692.4296875,
+      "completions/mean_terminated_length": 7339.11376953125,
+      "completions/min_length": 1269.0,
+      "completions/min_terminated_length": 1269.0,
+      "entropy": 0.94080401211977,
+      "epoch": 0.41582336706531736,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005152889993041754,
+      "learning_rate": 1e-05,
+      "loss": 0.0511,
+      "num_tokens": 397327029.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 5.027571751270443e-05,
+      "sampling/sampling_logp_difference/max": 9.897988319396973,
+      "sampling/sampling_logp_difference/mean": 0.02036213129758835,
+      "step": 452
+    },
+    {
+      "clip_ratio/high_max": 1.733157705530175e-05,
+      "clip_ratio/high_mean": 6.0586507970583625e-06,
+      "clip_ratio/low_mean": 2.335082047011383e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9409470812424843e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15305.0,
+      "completions/mean_length": 6968.0859375,
+      "completions/mean_terminated_length": 6742.1044921875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9254838973283768,
+      "epoch": 0.41674333026678934,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035838852636516094,
+      "learning_rate": 1e-05,
+      "loss": 0.0182,
+      "num_tokens": 398237536.0,
+      "reward": 0.484375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.002404628787189722,
+      "sampling/sampling_logp_difference/max": 6.030359745025635,
+      "sampling/sampling_logp_difference/mean": 0.020200733095407486,
+      "step": 453
+    },
+    {
+      "clip_ratio/high_max": 4.464923677005572e-06,
+      "clip_ratio/high_mean": 1.116230919251393e-06,
+      "clip_ratio/low_mean": 3.311113533754906e-05,
+      "clip_ratio/low_min": 6.725854291289579e-06,
+      "clip_ratio/region_mean": 3.422736637048729e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16309.0,
+      "completions/mean_length": 8711.078125,
+      "completions/mean_terminated_length": 8199.55078125,
+      "completions/min_length": 1049.0,
+      "completions/min_terminated_length": 1049.0,
+      "entropy": 0.8735406622290611,
+      "epoch": 0.41766329346826125,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0036290446296334267,
+      "learning_rate": 1e-05,
+      "loss": 0.0412,
+      "num_tokens": 399373298.0,
+      "reward": 0.359375,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000042200088501,
+      "sampling/importance_sampling_ratio/min": 9.216561011271551e-05,
+      "sampling/sampling_logp_difference/max": 9.291923522949219,
+      "sampling/sampling_logp_difference/mean": 0.0201371181756258,
+      "step": 454
+    },
+    {
+      "clip_ratio/high_max": 3.4702664606811595e-05,
+      "clip_ratio/high_mean": 8.675666151702899e-06,
+      "clip_ratio/low_mean": 3.3217100849469716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.189276808119757e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14737.0,
+      "completions/mean_length": 6891.078125,
+      "completions/mean_terminated_length": 6663.24853515625,
+      "completions/min_length": 827.0,
+      "completions/min_terminated_length": 827.0,
+      "entropy": 0.8689641878008842,
+      "epoch": 0.41858325666973323,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004067540634423494,
+      "learning_rate": 1e-05,
+      "loss": 0.0633,
+      "num_tokens": 400273708.0,
+      "reward": 0.484375,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999425411224365,
+      "sampling/importance_sampling_ratio/min": 4.0002717582865444e-07,
+      "sampling/sampling_logp_difference/max": 14.731733322143555,
+      "sampling/sampling_logp_difference/mean": 0.019800148904323578,
+      "step": 455
+    },
+    {
+      "clip_ratio/high_max": 2.939170826721238e-06,
+      "clip_ratio/high_mean": 7.347927066803095e-07,
+      "clip_ratio/low_mean": 3.564125790944672e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6376050502440194e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15234.0,
+      "completions/mean_length": 6899.3515625,
+      "completions/mean_terminated_length": 6748.8017578125,
+      "completions/min_length": 1149.0,
+      "completions/min_terminated_length": 1149.0,
+      "entropy": 0.9442604705691338,
+      "epoch": 0.41950321987120515,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026191689539700747,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 401177497.0,
+      "reward": 0.46875,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 0.0017910725437104702,
+      "sampling/sampling_logp_difference/max": 6.3249406814575195,
+      "sampling/sampling_logp_difference/mean": 0.021380646154284477,
+      "step": 456
+    },
+    {
+      "clip_ratio/high_max": 8.99604128790088e-06,
+      "clip_ratio/high_mean": 2.24901032197522e-06,
+      "clip_ratio/low_mean": 2.57235833487357e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.797259367071092e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16226.0,
+      "completions/mean_length": 7175.8359375,
+      "completions/mean_terminated_length": 7029.6748046875,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.8653769046068192,
+      "epoch": 0.4204231830726771,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003141516586765647,
+      "learning_rate": 1e-05,
+      "loss": 0.0674,
+      "num_tokens": 402115812.0,
+      "reward": 0.4375,
+      "reward_std": 0.21040895581245422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999862909317017,
+      "sampling/importance_sampling_ratio/min": 0.001265019178390503,
+      "sampling/sampling_logp_difference/max": 6.672667980194092,
+      "sampling/sampling_logp_difference/mean": 0.01970163732767105,
+      "step": 457
+    },
+    {
+      "clip_ratio/high_max": 1.0800059499160852e-05,
+      "clip_ratio/high_mean": 2.700014874790213e-06,
+      "clip_ratio/low_mean": 3.116219727417047e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3862211807900167e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 7090.8515625,
+      "completions/mean_terminated_length": 6791.072265625,
+      "completions/min_length": 606.0,
+      "completions/min_terminated_length": 606.0,
+      "entropy": 0.9437825232744217,
+      "epoch": 0.42134314627414904,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001980370609089732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 403048385.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 1.4011449138706666e-06,
+      "sampling/sampling_logp_difference/max": 13.47822093963623,
+      "sampling/sampling_logp_difference/mean": 0.021090596914291382,
+      "step": 458
+    },
+    {
+      "clip_ratio/high_max": 2.5482850560365478e-05,
+      "clip_ratio/high_mean": 6.370712640091369e-06,
+      "clip_ratio/low_mean": 4.8558076969129615e-05,
+      "clip_ratio/low_min": 4.8952420002024155e-06,
+      "clip_ratio/region_mean": 5.4928788131292094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16175.0,
+      "completions/mean_length": 7033.65625,
+      "completions/mean_terminated_length": 6809.24853515625,
+      "completions/min_length": 1007.0,
+      "completions/min_terminated_length": 1007.0,
+      "entropy": 0.8789731040596962,
+      "epoch": 0.42226310947562096,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003833206370472908,
+      "learning_rate": 1e-05,
+      "loss": 0.059,
+      "num_tokens": 403968037.0,
+      "reward": 0.46875,
+      "reward_std": 0.28460076451301575,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000317096710205,
+      "sampling/importance_sampling_ratio/min": 0.0021942879538983107,
+      "sampling/sampling_logp_difference/max": 6.1218976974487305,
+      "sampling/sampling_logp_difference/mean": 0.019913772121071815,
+      "step": 459
+    },
+    {
+      "clip_ratio/high_max": 4.068877842655638e-06,
+      "clip_ratio/high_mean": 1.0172194606639096e-06,
+      "clip_ratio/low_mean": 6.774969961043098e-05,
+      "clip_ratio/low_min": 3.189914878021227e-06,
+      "clip_ratio/region_mean": 6.876691895740805e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16107.0,
+      "completions/mean_length": 6992.8984375,
+      "completions/mean_terminated_length": 6611.14599609375,
+      "completions/min_length": 754.0,
+      "completions/min_terminated_length": 754.0,
+      "entropy": 0.857115626335144,
+      "epoch": 0.42318307267709293,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005315023008733988,
+      "learning_rate": 1e-05,
+      "loss": 0.1581,
+      "num_tokens": 404881584.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000758171081543,
+      "sampling/importance_sampling_ratio/min": 4.546630952972919e-05,
+      "sampling/sampling_logp_difference/max": 9.998538970947266,
+      "sampling/sampling_logp_difference/mean": 0.01872519962489605,
+      "step": 460
+    },
+    {
+      "clip_ratio/high_max": 1.167047457784065e-05,
+      "clip_ratio/high_mean": 2.9176186444601626e-06,
+      "clip_ratio/low_mean": 3.3195502112448594e-05,
+      "clip_ratio/low_min": 5.25188033861923e-06,
+      "clip_ratio/region_mean": 3.611312064322192e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 6623.2578125,
+      "completions/mean_terminated_length": 6226.4794921875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "entropy": 0.8803941905498505,
+      "epoch": 0.42410303587856485,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0074885934591293335,
+      "learning_rate": 1e-05,
+      "loss": 0.1076,
+      "num_tokens": 405749105.0,
+      "reward": 0.515625,
+      "reward_std": 0.25354722142219543,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.0011723897187039256,
+      "sampling/sampling_logp_difference/max": 6.748711109161377,
+      "sampling/sampling_logp_difference/mean": 0.01930626854300499,
+      "step": 461
+    },
+    {
+      "clip_ratio/high_max": 4.11753080697963e-06,
+      "clip_ratio/high_mean": 1.0293827017449075e-06,
+      "clip_ratio/low_mean": 5.09268712676203e-05,
+      "clip_ratio/low_min": 1.1170248626513057e-05,
+      "clip_ratio/region_mean": 5.195625465148623e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15032.0,
+      "completions/mean_length": 7244.8203125,
+      "completions/mean_terminated_length": 6647.5419921875,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.9202689751982689,
+      "epoch": 0.4250229990800368,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003960717935115099,
+      "learning_rate": 1e-05,
+      "loss": 0.0536,
+      "num_tokens": 406704618.0,
+      "reward": 0.484375,
+      "reward_std": 0.2880108058452606,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 1.69715603988152e-05,
+      "sampling/sampling_logp_difference/max": 10.98397159576416,
+      "sampling/sampling_logp_difference/mean": 0.02019711770117283,
+      "step": 462
+    },
+    {
+      "clip_ratio/high_max": 2.874629831239872e-05,
+      "clip_ratio/high_mean": 1.0519701334033016e-05,
+      "clip_ratio/low_mean": 5.367962035052187e-05,
+      "clip_ratio/low_min": 6.5083827394119e-06,
+      "clip_ratio/region_mean": 6.419932219614566e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 7462.0546875,
+      "completions/mean_terminated_length": 6867.2587890625,
+      "completions/min_length": 669.0,
+      "completions/min_terminated_length": 669.0,
+      "entropy": 0.8141553401947021,
+      "epoch": 0.42594296228150874,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003602087963372469,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 407677177.0,
+      "reward": 0.421875,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999440312385559,
+      "sampling/importance_sampling_ratio/min": 0.0007806668290868402,
+      "sampling/sampling_logp_difference/max": 7.155362129211426,
+      "sampling/sampling_logp_difference/mean": 0.01856713369488716,
+      "step": 463
+    },
+    {
+      "clip_ratio/high_max": 2.6413443720230134e-05,
+      "clip_ratio/high_mean": 8.973188073468918e-06,
+      "clip_ratio/low_mean": 3.5997712757307454e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.497090230870526e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15750.0,
+      "completions/mean_length": 6683.1796875,
+      "completions/mean_terminated_length": 6529.19873046875,
+      "completions/min_length": 775.0,
+      "completions/min_terminated_length": 775.0,
+      "entropy": 0.9070071652531624,
+      "epoch": 0.42686292548298066,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004038481041789055,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 408552512.0,
+      "reward": 0.4609375,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000439882278442,
+      "sampling/importance_sampling_ratio/min": 4.474630986806005e-05,
+      "sampling/sampling_logp_difference/max": 10.014501571655273,
+      "sampling/sampling_logp_difference/mean": 0.02077356167137623,
+      "step": 464
+    },
+    {
+      "clip_ratio/high_max": 1.7171289982798044e-05,
+      "clip_ratio/high_mean": 4.292822495699511e-06,
+      "clip_ratio/low_mean": 3.225401701456576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654683996501262e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15864.0,
+      "completions/mean_length": 6472.9453125,
+      "completions/mean_terminated_length": 5985.51611328125,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8807859197258949,
+      "epoch": 0.42778288868445263,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004457853268831968,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 409399257.0,
+      "reward": 0.421875,
+      "reward_std": 0.20517179369926453,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 0.0017577135004103184,
+      "sampling/sampling_logp_difference/max": 6.343741416931152,
+      "sampling/sampling_logp_difference/mean": 0.020475786179304123,
+      "step": 465
+    },
+    {
+      "clip_ratio/high_max": 5.442162637336878e-05,
+      "clip_ratio/high_mean": 1.584139977239829e-05,
+      "clip_ratio/low_mean": 5.706528349946893e-05,
+      "clip_ratio/low_min": 2.5156462925224332e-05,
+      "clip_ratio/region_mean": 7.290668463610928e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15896.0,
+      "completions/mean_length": 5989.78125,
+      "completions/mean_terminated_length": 5654.48388671875,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.8479711338877678,
+      "epoch": 0.42870285188592455,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033953245729207993,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 410185645.0,
+      "reward": 0.5,
+      "reward_std": 0.3735082745552063,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999676942825317,
+      "sampling/importance_sampling_ratio/min": 1.781588616722729e-05,
+      "sampling/sampling_logp_difference/max": 10.935420036315918,
+      "sampling/sampling_logp_difference/mean": 0.017986344173550606,
+      "step": 466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.2673244681500364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2673244681500364e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 8299.9453125,
+      "completions/mean_terminated_length": 8171.62744140625,
+      "completions/min_length": 1123.0,
+      "completions/min_terminated_length": 1123.0,
+      "entropy": 0.9363152608275414,
+      "epoch": 0.4296228150873965,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002381247701123357,
+      "learning_rate": 1e-05,
+      "loss": 0.0651,
+      "num_tokens": 411268974.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 0.000553094083443284,
+      "sampling/sampling_logp_difference/max": 7.4999823570251465,
+      "sampling/sampling_logp_difference/mean": 0.021354343742132187,
+      "step": 467
+    },
+    {
+      "clip_ratio/high_max": 8.578695997130126e-06,
+      "clip_ratio/high_mean": 2.1446739992825314e-06,
+      "clip_ratio/low_mean": 2.84454882830687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.059016239603807e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14838.0,
+      "completions/mean_length": 7434.0546875,
+      "completions/mean_terminated_length": 7219.25634765625,
+      "completions/min_length": 898.0,
+      "completions/min_terminated_length": 898.0,
+      "entropy": 0.981913685798645,
+      "epoch": 0.43054277828886844,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006341467145830393,
+      "learning_rate": 1e-05,
+      "loss": -0.003,
+      "num_tokens": 412238117.0,
+      "reward": 0.390625,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 0.0019304680172353983,
+      "sampling/sampling_logp_difference/max": 6.249992847442627,
+      "sampling/sampling_logp_difference/mean": 0.02139873616397381,
+      "step": 468
+    },
+    {
+      "clip_ratio/high_max": 1.7187987396027893e-05,
+      "clip_ratio/high_mean": 5.150076049176278e-06,
+      "clip_ratio/low_mean": 5.4699471832009294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.9849548279089504e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15871.0,
+      "completions/mean_length": 7211.1796875,
+      "completions/mean_terminated_length": 7138.95263671875,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "entropy": 0.9307222217321396,
+      "epoch": 0.43146274149034036,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002621602965518832,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 413182860.0,
+      "reward": 0.3203125,
+      "reward_std": 0.34716784954071045,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999529123306274,
+      "sampling/importance_sampling_ratio/min": 5.1446182624204084e-05,
+      "sampling/sampling_logp_difference/max": 9.874974250793457,
+      "sampling/sampling_logp_difference/mean": 0.020250719040632248,
+      "step": 469
+    },
+    {
+      "clip_ratio/high_max": 1.0867412584047997e-05,
+      "clip_ratio/high_mean": 3.9217885614561965e-06,
+      "clip_ratio/low_mean": 4.7740833792886406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.16626223543426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15726.0,
+      "completions/mean_length": 5349.4296875,
+      "completions/mean_terminated_length": 5174.2783203125,
+      "completions/min_length": 983.0,
+      "completions/min_terminated_length": 983.0,
+      "entropy": 1.0213474333286285,
+      "epoch": 0.43238270469181234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035241330042481422,
+      "learning_rate": 1e-05,
+      "loss": 0.0657,
+      "num_tokens": 413885963.0,
+      "reward": 0.3046875,
+      "reward_std": 0.25330984592437744,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999449253082275,
+      "sampling/importance_sampling_ratio/min": 0.0003569081309251487,
+      "sampling/sampling_logp_difference/max": 7.938032150268555,
+      "sampling/sampling_logp_difference/mean": 0.01975759118795395,
+      "step": 470
+    },
+    {
+      "clip_ratio/high_max": 1.469514609198086e-05,
+      "clip_ratio/high_mean": 3.673786522995215e-06,
+      "clip_ratio/low_mean": 2.699725871480041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0671045237795624e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15357.0,
+      "completions/mean_length": 7542.8515625,
+      "completions/mean_terminated_length": 7257.65283203125,
+      "completions/min_length": 1359.0,
+      "completions/min_terminated_length": 1359.0,
+      "entropy": 0.8882969543337822,
+      "epoch": 0.43330266789328425,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014164346503093839,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 414870560.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20753081142902374,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000402927398682,
+      "sampling/importance_sampling_ratio/min": 6.435441900976002e-05,
+      "sampling/sampling_logp_difference/max": 9.651104927062988,
+      "sampling/sampling_logp_difference/mean": 0.020874422043561935,
+      "step": 471
+    },
+    {
+      "clip_ratio/high_max": 1.669827497607912e-05,
+      "clip_ratio/high_mean": 4.17456874401978e-06,
+      "clip_ratio/low_mean": 3.673103901746799e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.090560787517461e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7286.90625,
+      "completions/mean_terminated_length": 6993.451171875,
+      "completions/min_length": 977.0,
+      "completions/min_terminated_length": 977.0,
+      "entropy": 0.9254636988043785,
+      "epoch": 0.43422263109475623,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026956009678542614,
+      "learning_rate": 1e-05,
+      "loss": 0.0567,
+      "num_tokens": 415825252.0,
+      "reward": 0.328125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999917209148407,
+      "sampling/importance_sampling_ratio/min": 0.0019701423589140177,
+      "sampling/sampling_logp_difference/max": 6.229649543762207,
+      "sampling/sampling_logp_difference/mean": 0.0202642735093832,
+      "step": 472
+    },
+    {
+      "clip_ratio/high_max": 9.162045444099931e-06,
+      "clip_ratio/high_mean": 2.2905113610249828e-06,
+      "clip_ratio/low_mean": 3.818475033767754e-05,
+      "clip_ratio/low_min": 7.20606476534158e-06,
+      "clip_ratio/region_mean": 4.047526181238936e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15908.0,
+      "completions/mean_length": 7244.7421875,
+      "completions/mean_terminated_length": 6716.0244140625,
+      "completions/min_length": 1010.0,
+      "completions/min_terminated_length": 1010.0,
+      "entropy": 0.7817923128604889,
+      "epoch": 0.43514259429622815,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022128887940198183,
+      "learning_rate": 1e-05,
+      "loss": 0.0577,
+      "num_tokens": 416774011.0,
+      "reward": 0.453125,
+      "reward_std": 0.2937847375869751,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0015034435782581568,
+      "sampling/sampling_logp_difference/max": 6.499997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01840684749186039,
+      "step": 473
+    },
+    {
+      "clip_ratio/high_max": 1.2232871313244686e-05,
+      "clip_ratio/high_mean": 3.0582178283111716e-06,
+      "clip_ratio/low_mean": 3.636896872194484e-05,
+      "clip_ratio/low_min": 3.1460788250115e-06,
+      "clip_ratio/region_mean": 3.9427186266038916e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16254.0,
+      "completions/mean_length": 9042.90625,
+      "completions/mean_terminated_length": 8283.482421875,
+      "completions/min_length": 997.0,
+      "completions/min_terminated_length": 997.0,
+      "entropy": 0.9306210279464722,
+      "epoch": 0.43606255749770007,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034676652867347,
+      "learning_rate": 1e-05,
+      "loss": 0.0504,
+      "num_tokens": 417951311.0,
+      "reward": 0.265625,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999234080314636,
+      "sampling/importance_sampling_ratio/min": 0.0002641192404553294,
+      "sampling/sampling_logp_difference/max": 8.239109992980957,
+      "sampling/sampling_logp_difference/mean": 0.02112819254398346,
+      "step": 474
+    },
+    {
+      "clip_ratio/high_max": 2.5187824576278217e-05,
+      "clip_ratio/high_mean": 8.202394610634656e-06,
+      "clip_ratio/low_mean": 4.3606626604741905e-05,
+      "clip_ratio/low_min": 3.5752079838857753e-06,
+      "clip_ratio/region_mean": 5.1809020988002885e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15721.0,
+      "completions/mean_length": 6763.6328125,
+      "completions/mean_terminated_length": 6610.9287109375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9879302233457565,
+      "epoch": 0.43698252069917204,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030218157917261124,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 418836184.0,
+      "reward": 0.484375,
+      "reward_std": 0.30091896653175354,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 0.0003778560785576701,
+      "sampling/sampling_logp_difference/max": 7.880997180938721,
+      "sampling/sampling_logp_difference/mean": 0.021101050078868866,
+      "step": 475
+    },
+    {
+      "clip_ratio/high_max": 1.0644185749697499e-05,
+      "clip_ratio/high_mean": 2.6610464374243747e-06,
+      "clip_ratio/low_mean": 6.21261324340594e-05,
+      "clip_ratio/low_min": 3.6509140954876784e-06,
+      "clip_ratio/region_mean": 6.478717887148377e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15675.0,
+      "completions/mean_length": 6794.25,
+      "completions/mean_terminated_length": 6564.09619140625,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 1.0259138569235802,
+      "epoch": 0.43790248390064396,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002881827764213085,
+      "learning_rate": 1e-05,
+      "loss": 0.0592,
+      "num_tokens": 419726192.0,
+      "reward": 0.265625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999275207519531,
+      "sampling/importance_sampling_ratio/min": 9.217044407705544e-07,
+      "sampling/sampling_logp_difference/max": 13.897041320800781,
+      "sampling/sampling_logp_difference/mean": 0.0210823193192482,
+      "step": 476
+    },
+    {
+      "clip_ratio/high_max": 1.108860487875063e-05,
+      "clip_ratio/high_mean": 2.7721512196876574e-06,
+      "clip_ratio/low_mean": 4.70996876629215e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9871839337356505e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14281.0,
+      "completions/max_terminated_length": 14281.0,
+      "completions/mean_length": 5648.2109375,
+      "completions/mean_terminated_length": 5648.2109375,
+      "completions/min_length": 935.0,
+      "completions/min_terminated_length": 935.0,
+      "entropy": 0.88894472271204,
+      "epoch": 0.43882244710211593,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00289533962495625,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 420468867.0,
+      "reward": 0.484375,
+      "reward_std": 0.2675113081932068,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998449087142944,
+      "sampling/importance_sampling_ratio/min": 0.001372925122268498,
+      "sampling/sampling_logp_difference/max": 6.590811729431152,
+      "sampling/sampling_logp_difference/mean": 0.018499158322811127,
+      "step": 477
+    },
+    {
+      "clip_ratio/high_max": 4.753574557980755e-06,
+      "clip_ratio/high_mean": 1.1883936394951888e-06,
+      "clip_ratio/low_mean": 2.4103785335682915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5292179316238617e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15657.0,
+      "completions/mean_length": 6188.359375,
+      "completions/mean_terminated_length": 6026.52392578125,
+      "completions/min_length": 1085.0,
+      "completions/min_terminated_length": 1085.0,
+      "entropy": 0.8476063013076782,
+      "epoch": 0.43974241030358785,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.002749695209786296,
+      "learning_rate": 1e-05,
+      "loss": 0.0012,
+      "num_tokens": 421280881.0,
+      "reward": 0.3671875,
+      "reward_std": 0.15991678833961487,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796152114868,
+      "sampling/importance_sampling_ratio/min": 0.004578418098390102,
+      "sampling/sampling_logp_difference/max": 5.386401653289795,
+      "sampling/sampling_logp_difference/mean": 0.018456483259797096,
+      "step": 478
+    },
+    {
+      "clip_ratio/high_max": 4.1359915030625416e-05,
+      "clip_ratio/high_mean": 1.0339978757656354e-05,
+      "clip_ratio/low_mean": 4.786080125995795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8200780586048495e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 6864.3515625,
+      "completions/mean_terminated_length": 6635.88037109375,
+      "completions/min_length": 1065.0,
+      "completions/min_terminated_length": 1065.0,
+      "entropy": 0.8666203916072845,
+      "epoch": 0.4406623735050598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.005116373300552368,
+      "learning_rate": 1e-05,
+      "loss": 0.0347,
+      "num_tokens": 422177822.0,
+      "reward": 0.4453125,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 0.00020385721290949732,
+      "sampling/sampling_logp_difference/max": 8.498090744018555,
+      "sampling/sampling_logp_difference/mean": 0.01979806460440159,
+      "step": 479
+    },
+    {
+      "clip_ratio/high_max": 1.4544774558089557e-05,
+      "clip_ratio/high_mean": 3.6361936395223893e-06,
+      "clip_ratio/low_mean": 4.153812756158004e-05,
+      "clip_ratio/low_min": 3.606462769312202e-06,
+      "clip_ratio/region_mean": 4.51743208031985e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 7023.828125,
+      "completions/mean_terminated_length": 6799.18408203125,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9098334684967995,
+      "epoch": 0.44158233670653174,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0020944855641573668,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 423096576.0,
+      "reward": 0.2734375,
+      "reward_std": 0.20858672261238098,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999480247497559,
+      "sampling/importance_sampling_ratio/min": 0.0027383591514080763,
+      "sampling/sampling_logp_difference/max": 5.900396347045898,
+      "sampling/sampling_logp_difference/mean": 0.020111342892050743,
+      "step": 480
+    },
+    {
+      "clip_ratio/high_max": 3.256236095694476e-05,
+      "clip_ratio/high_mean": 1.2372795026749372e-05,
+      "clip_ratio/low_mean": 5.0774355258909054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.314715119515313e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15527.0,
+      "completions/mean_length": 6666.828125,
+      "completions/mean_terminated_length": 6512.587890625,
+      "completions/min_length": 872.0,
+      "completions/min_terminated_length": 872.0,
+      "entropy": 0.9162466824054718,
+      "epoch": 0.44250229990800366,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003897767048329115,
+      "learning_rate": 1e-05,
+      "loss": 0.1151,
+      "num_tokens": 423968050.0,
+      "reward": 0.46875,
+      "reward_std": 0.3527044653892517,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0031828521750867367,
+      "sampling/sampling_logp_difference/max": 5.7499775886535645,
+      "sampling/sampling_logp_difference/mean": 0.019923247396945953,
+      "step": 481
+    },
+    {
+      "clip_ratio/high_max": 1.5341902098953142e-05,
+      "clip_ratio/high_mean": 4.791600815678976e-06,
+      "clip_ratio/low_mean": 7.980174223121139e-05,
+      "clip_ratio/low_min": 2.6713308216130827e-05,
+      "clip_ratio/region_mean": 8.459334412691533e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16223.0,
+      "completions/mean_length": 7159.8046875,
+      "completions/mean_terminated_length": 7013.38916015625,
+      "completions/min_length": 1022.0,
+      "completions/min_terminated_length": 1022.0,
+      "entropy": 0.8444746807217598,
+      "epoch": 0.44342226310947563,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003038195427507162,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 424902953.0,
+      "reward": 0.359375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940037727356,
+      "sampling/importance_sampling_ratio/min": 7.431909580191132e-06,
+      "sampling/sampling_logp_difference/max": 11.809727668762207,
+      "sampling/sampling_logp_difference/mean": 0.019014043733477592,
+      "step": 482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.55851120666739e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.55851120666739e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14716.0,
+      "completions/mean_length": 6146.2109375,
+      "completions/mean_terminated_length": 6065.5986328125,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "entropy": 0.8365580290555954,
+      "epoch": 0.44434222631094755,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025550283025950193,
+      "learning_rate": 1e-05,
+      "loss": 0.0548,
+      "num_tokens": 425709212.0,
+      "reward": 0.5625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000015497207642,
+      "sampling/importance_sampling_ratio/min": 0.0006884043687023222,
+      "sampling/sampling_logp_difference/max": 7.281134128570557,
+      "sampling/sampling_logp_difference/mean": 0.019193854182958603,
+      "step": 483
+    },
+    {
+      "clip_ratio/high_max": 2.4752349872869672e-05,
+      "clip_ratio/high_mean": 7.036488455014478e-06,
+      "clip_ratio/low_mean": 4.780410063176532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.484058920046664e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16153.0,
+      "completions/mean_length": 6557.578125,
+      "completions/mean_terminated_length": 6321.744140625,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.8316832035779953,
+      "epoch": 0.4452621895124195,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005126865580677986,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 426566462.0,
+      "reward": 0.484375,
+      "reward_std": 0.27852246165275574,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 2.7536634661373682e-05,
+      "sampling/sampling_logp_difference/max": 10.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01839536987245083,
+      "step": 484
+    },
+    {
+      "clip_ratio/high_max": 3.443571449679439e-05,
+      "clip_ratio/high_mean": 8.608928624198597e-06,
+      "clip_ratio/low_mean": 5.915772453590762e-05,
+      "clip_ratio/low_min": 1.7084812043322017e-05,
+      "clip_ratio/region_mean": 6.776665304641938e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16359.0,
+      "completions/mean_length": 7007.3203125,
+      "completions/mean_terminated_length": 6858.484375,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8674142584204674,
+      "epoch": 0.44618215271389144,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004829525947570801,
+      "learning_rate": 1e-05,
+      "loss": 0.0753,
+      "num_tokens": 427480007.0,
+      "reward": 0.46875,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998922944068909,
+      "sampling/importance_sampling_ratio/min": 0.00020170137577224523,
+      "sampling/sampling_logp_difference/max": 8.508722305297852,
+      "sampling/sampling_logp_difference/mean": 0.019586069509387016,
+      "step": 485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.539863354897534e-05,
+      "clip_ratio/low_min": 8.211341992137022e-06,
+      "clip_ratio/region_mean": 5.539863354897534e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14748.0,
+      "completions/mean_length": 7069.8828125,
+      "completions/mean_terminated_length": 6922.0400390625,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.9066255167126656,
+      "epoch": 0.44710211591536336,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003539952216669917,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 428404968.0,
+      "reward": 0.5,
+      "reward_std": 0.3618982434272766,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 0.00024052867956925184,
+      "sampling/sampling_logp_difference/max": 8.332671165466309,
+      "sampling/sampling_logp_difference/mean": 0.020427238196134567,
+      "step": 486
+    },
+    {
+      "clip_ratio/high_max": 1.6550495729461545e-05,
+      "clip_ratio/high_mean": 4.137623932365386e-06,
+      "clip_ratio/low_mean": 5.576918465521885e-05,
+      "clip_ratio/low_min": 1.2613936178240692e-05,
+      "clip_ratio/region_mean": 5.99068093833921e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15290.0,
+      "completions/max_terminated_length": 15290.0,
+      "completions/mean_length": 5586.6875,
+      "completions/mean_terminated_length": 5586.6875,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.9208655655384064,
+      "epoch": 0.44802207911683534,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0030504625756293535,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 429137176.0,
+      "reward": 0.515625,
+      "reward_std": 0.3480040729045868,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999984502792358,
+      "sampling/importance_sampling_ratio/min": 0.0005498559912666678,
+      "sampling/sampling_logp_difference/max": 7.50585412979126,
+      "sampling/sampling_logp_difference/mean": 0.019396595656871796,
+      "step": 487
+    },
+    {
+      "clip_ratio/high_max": 3.3761509712348925e-05,
+      "clip_ratio/high_mean": 8.440377428087231e-06,
+      "clip_ratio/low_mean": 3.6384140912559815e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.482451868170756e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15404.0,
+      "completions/mean_length": 5266.265625,
+      "completions/mean_terminated_length": 4999.4404296875,
+      "completions/min_length": 492.0,
+      "completions/min_terminated_length": 492.0,
+      "entropy": 0.7884859293699265,
+      "epoch": 0.44894204231830726,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003902251599356532,
+      "learning_rate": 1e-05,
+      "loss": -0.0077,
+      "num_tokens": 429836026.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.05675617232918739,
+      "sampling/sampling_logp_difference/max": 2.868990898132324,
+      "sampling/sampling_logp_difference/mean": 0.01770034246146679,
+      "step": 488
+    },
+    {
+      "clip_ratio/high_max": 2.2323702978610527e-05,
+      "clip_ratio/high_mean": 5.580925744652632e-06,
+      "clip_ratio/low_mean": 4.0199149452746497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.578007497002545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6398.53125,
+      "completions/mean_terminated_length": 6319.9052734375,
+      "completions/min_length": 699.0,
+      "completions/min_terminated_length": 699.0,
+      "entropy": 0.8982341960072517,
+      "epoch": 0.44986200551977923,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024998660665005445,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 430673446.0,
+      "reward": 0.421875,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797940254211,
+      "sampling/importance_sampling_ratio/min": 0.000612784584518522,
+      "sampling/sampling_logp_difference/max": 7.397497177124023,
+      "sampling/sampling_logp_difference/mean": 0.020521972328424454,
+      "step": 489
+    },
+    {
+      "clip_ratio/high_max": 3.1756624366607866e-05,
+      "clip_ratio/high_mean": 7.939156091651967e-06,
+      "clip_ratio/low_mean": 8.124458963720826e-05,
+      "clip_ratio/low_min": 1.2379174222587608e-05,
+      "clip_ratio/region_mean": 8.91837471499457e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14374.0,
+      "completions/mean_length": 6277.65625,
+      "completions/mean_terminated_length": 6198.07861328125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8139145970344543,
+      "epoch": 0.45078196872125115,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00784115307033062,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 431497546.0,
+      "reward": 0.546875,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999848484992981,
+      "sampling/importance_sampling_ratio/min": 0.0006267798598855734,
+      "sampling/sampling_logp_difference/max": 7.37491512298584,
+      "sampling/sampling_logp_difference/mean": 0.01836184598505497,
+      "step": 490
+    },
+    {
+      "clip_ratio/high_max": 8.875004823494237e-06,
+      "clip_ratio/high_mean": 2.2187512058735592e-06,
+      "clip_ratio/low_mean": 2.3825880248296016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6044631454169576e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15903.0,
+      "completions/mean_length": 7708.59375,
+      "completions/mean_terminated_length": 7355.9345703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.087083138525486,
+      "epoch": 0.45170193192272307,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.004277343396097422,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 432503414.0,
+      "reward": 0.2890625,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999503493309021,
+      "sampling/importance_sampling_ratio/min": 1.2187546417408157e-05,
+      "sampling/sampling_logp_difference/max": 11.315095901489258,
+      "sampling/sampling_logp_difference/mean": 0.02224145457148552,
+      "step": 491
+    },
+    {
+      "clip_ratio/high_max": 6.384065272868611e-06,
+      "clip_ratio/high_mean": 1.5960163182171527e-06,
+      "clip_ratio/low_mean": 3.561227788395627e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.720829374742607e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7162.7109375,
+      "completions/mean_terminated_length": 6865.25,
+      "completions/min_length": 842.0,
+      "completions/min_terminated_length": 842.0,
+      "entropy": 0.9157010763883591,
+      "epoch": 0.45262189512419504,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006278311368077993,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 433439137.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2227931171655655,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966561794281,
+      "sampling/importance_sampling_ratio/min": 0.0005532125360332429,
+      "sampling/sampling_logp_difference/max": 7.499768257141113,
+      "sampling/sampling_logp_difference/mean": 0.02123419940471649,
+      "step": 492
+    },
+    {
+      "clip_ratio/high_max": 2.846911434062349e-05,
+      "clip_ratio/high_mean": 8.656040449750435e-06,
+      "clip_ratio/low_mean": 5.1716241614485625e-05,
+      "clip_ratio/low_min": 3.601579010137357e-06,
+      "clip_ratio/region_mean": 6.037228104105452e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16123.0,
+      "completions/mean_length": 7388.90625,
+      "completions/mean_terminated_length": 7023.251953125,
+      "completions/min_length": 980.0,
+      "completions/min_terminated_length": 980.0,
+      "entropy": 0.7670486867427826,
+      "epoch": 0.45354185832566696,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005177734419703484,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "num_tokens": 434402045.0,
+      "reward": 0.3828125,
+      "reward_std": 0.37951958179473877,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999250769615173,
+      "sampling/importance_sampling_ratio/min": 0.0022511729039251804,
+      "sampling/sampling_logp_difference/max": 6.096303939819336,
+      "sampling/sampling_logp_difference/mean": 0.01827731542289257,
+      "step": 493
+    },
+    {
+      "clip_ratio/high_max": 2.1548471977439476e-05,
+      "clip_ratio/high_mean": 6.257203722270788e-06,
+      "clip_ratio/low_mean": 7.719641234871233e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.345361538886209e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 6805.375,
+      "completions/mean_terminated_length": 6496.38671875,
+      "completions/min_length": 587.0,
+      "completions/min_terminated_length": 587.0,
+      "entropy": 0.8407405763864517,
+      "epoch": 0.45446182152713893,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032320048194378614,
+      "learning_rate": 1e-05,
+      "loss": 0.0662,
+      "num_tokens": 435292029.0,
+      "reward": 0.4296875,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999642372131348,
+      "sampling/importance_sampling_ratio/min": 6.679954094579443e-05,
+      "sampling/sampling_logp_difference/max": 9.613814353942871,
+      "sampling/sampling_logp_difference/mean": 0.018761277198791504,
+      "step": 494
+    },
+    {
+      "clip_ratio/high_max": 3.460495008766884e-06,
+      "clip_ratio/high_mean": 8.65123752191721e-07,
+      "clip_ratio/low_mean": 7.76378024056612e-05,
+      "clip_ratio/low_min": 1.7026316072588088e-05,
+      "clip_ratio/region_mean": 7.850292649891344e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15105.0,
+      "completions/mean_length": 5753.4140625,
+      "completions/mean_terminated_length": 5321.2763671875,
+      "completions/min_length": 946.0,
+      "completions/min_terminated_length": 946.0,
+      "entropy": 0.7848984077572823,
+      "epoch": 0.45538178472861085,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030854379292577505,
+      "learning_rate": 1e-05,
+      "loss": 0.0279,
+      "num_tokens": 436046842.0,
+      "reward": 0.578125,
+      "reward_std": 0.31405961513519287,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998626708984375,
+      "sampling/importance_sampling_ratio/min": 4.36544311810394e-09,
+      "sampling/sampling_logp_difference/max": 19.24954605102539,
+      "sampling/sampling_logp_difference/mean": 0.017733070999383926,
+      "step": 495
+    },
+    {
+      "clip_ratio/high_max": 1.7207588371093152e-05,
+      "clip_ratio/high_mean": 4.301897092773288e-06,
+      "clip_ratio/low_mean": 3.234025916754035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.664215591925313e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6522.84375,
+      "completions/mean_terminated_length": 6445.19677734375,
+      "completions/min_length": 1062.0,
+      "completions/min_terminated_length": 1062.0,
+      "entropy": 1.0593653172254562,
+      "epoch": 0.4563017479300828,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003124243812635541,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 436899638.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2706219553947449,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999418258666992,
+      "sampling/importance_sampling_ratio/min": 4.476920821616659e-06,
+      "sampling/sampling_logp_difference/max": 12.316575050354004,
+      "sampling/sampling_logp_difference/mean": 0.021180003881454468,
+      "step": 496
+    },
+    {
+      "clip_ratio/high_max": 1.1790433973146719e-05,
+      "clip_ratio/high_mean": 2.9476084932866797e-06,
+      "clip_ratio/low_mean": 2.8437304308681632e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.138491274512489e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14515.0,
+      "completions/mean_length": 6203.203125,
+      "completions/mean_terminated_length": 5874.7900390625,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.8152795508503914,
+      "epoch": 0.45722171113155474,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005001795012503862,
+      "learning_rate": 1e-05,
+      "loss": 0.0817,
+      "num_tokens": 437713008.0,
+      "reward": 0.4296875,
+      "reward_std": 0.26143795251846313,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101758003235,
+      "sampling/importance_sampling_ratio/min": 0.001757707679644227,
+      "sampling/sampling_logp_difference/max": 6.34374475479126,
+      "sampling/sampling_logp_difference/mean": 0.017751028761267662,
+      "step": 497
+    },
+    {
+      "clip_ratio/high_max": 1.3163793028070359e-05,
+      "clip_ratio/high_mean": 4.229499381835922e-06,
+      "clip_ratio/low_mean": 4.4599403963729856e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.882890357293945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15423.0,
+      "completions/mean_length": 5975.5234375,
+      "completions/mean_terminated_length": 5725.72021484375,
+      "completions/min_length": 690.0,
+      "completions/min_terminated_length": 690.0,
+      "entropy": 0.8275932744145393,
+      "epoch": 0.45814167433302666,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005084732081741095,
+      "learning_rate": 1e-05,
+      "loss": 0.0759,
+      "num_tokens": 438495811.0,
+      "reward": 0.5390625,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998699426651001,
+      "sampling/importance_sampling_ratio/min": 3.120788460364565e-05,
+      "sampling/sampling_logp_difference/max": 10.374839782714844,
+      "sampling/sampling_logp_difference/mean": 0.018671832978725433,
+      "step": 498
+    },
+    {
+      "clip_ratio/high_max": 3.229640242352616e-06,
+      "clip_ratio/high_mean": 8.07410060588154e-07,
+      "clip_ratio/low_mean": 3.0413870263146237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1221280551108066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 7019.59375,
+      "completions/mean_terminated_length": 7019.59375,
+      "completions/min_length": 1058.0,
+      "completions/min_terminated_length": 1058.0,
+      "entropy": 0.9266618490219116,
+      "epoch": 0.45906163753449863,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002567912917584181,
+      "learning_rate": 1e-05,
+      "loss": 0.0282,
+      "num_tokens": 439413055.0,
+      "reward": 0.375,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000476837158203,
+      "sampling/importance_sampling_ratio/min": 0.0010315657127648592,
+      "sampling/sampling_logp_difference/max": 6.876677513122559,
+      "sampling/sampling_logp_difference/mean": 0.02012534812092781,
+      "step": 499
+    },
+    {
+      "clip_ratio/high_max": 1.8327779343962902e-05,
+      "clip_ratio/high_mean": 4.5819448359907256e-06,
+      "clip_ratio/low_mean": 4.08189575864526e-05,
+      "clip_ratio/low_min": 4.041122338094283e-06,
+      "clip_ratio/region_mean": 4.5400901854009135e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 7373.3203125,
+      "completions/mean_terminated_length": 7082.65283203125,
+      "completions/min_length": 854.0,
+      "completions/min_terminated_length": 854.0,
+      "entropy": 0.9383682310581207,
+      "epoch": 0.45998160073597055,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004862098954617977,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 440375128.0,
+      "reward": 0.4375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.0006883886526338756,
+      "sampling/sampling_logp_difference/max": 7.28115701675415,
+      "sampling/sampling_logp_difference/mean": 0.020596595481038094,
+      "step": 500
+    },
+    {
+      "clip_ratio/high_max": 1.650619151405408e-05,
+      "clip_ratio/high_mean": 4.12654787851352e-06,
+      "clip_ratio/low_mean": 6.364750265674957e-05,
+      "clip_ratio/low_min": 3.94595599573222e-06,
+      "clip_ratio/region_mean": 6.77740499668289e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16280.0,
+      "completions/mean_length": 5944.953125,
+      "completions/mean_terminated_length": 5862.755859375,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "entropy": 0.9130716845393181,
+      "epoch": 0.4609015639374425,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041388699784875,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 441156306.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999566078186035,
+      "sampling/importance_sampling_ratio/min": 0.0007685241289436817,
+      "sampling/sampling_logp_difference/max": 7.171038627624512,
+      "sampling/sampling_logp_difference/mean": 0.019817989319562912,
+      "step": 501
+    },
+    {
+      "clip_ratio/high_max": 2.9951792839710834e-05,
+      "clip_ratio/high_mean": 9.205811807078135e-06,
+      "clip_ratio/low_mean": 3.147234815514821e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0678160075913183e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16181.0,
+      "completions/mean_length": 6686.015625,
+      "completions/mean_terminated_length": 6609.6533203125,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 0.8640913739800453,
+      "epoch": 0.46182152713891444,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005679543130099773,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 442032972.0,
+      "reward": 0.5546875,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 0.007731473073363304,
+      "sampling/sampling_logp_difference/max": 4.86245584487915,
+      "sampling/sampling_logp_difference/mean": 0.019738182425498962,
+      "step": 502
+    },
+    {
+      "clip_ratio/high_max": 3.0190597726686974e-05,
+      "clip_ratio/high_mean": 7.5476494316717435e-06,
+      "clip_ratio/low_mean": 3.858067566397949e-05,
+      "clip_ratio/low_min": 9.290916750614997e-06,
+      "clip_ratio/region_mean": 4.612832617567619e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 6945.5,
+      "completions/mean_terminated_length": 6231.6640625,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.8156519457697868,
+      "epoch": 0.46274149034038636,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006176612339913845,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 442940940.0,
+      "reward": 0.46875,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999117851257324,
+      "sampling/importance_sampling_ratio/min": 0.00018278000061400235,
+      "sampling/sampling_logp_difference/max": 8.607227325439453,
+      "sampling/sampling_logp_difference/mean": 0.01836501806974411,
+      "step": 503
+    },
+    {
+      "clip_ratio/high_max": 2.2105000425653998e-05,
+      "clip_ratio/high_mean": 6.28071654773521e-06,
+      "clip_ratio/low_mean": 3.060894187001395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6889658531436e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15847.0,
+      "completions/mean_length": 8068.5390625,
+      "completions/mean_terminated_length": 7363.8388671875,
+      "completions/min_length": 875.0,
+      "completions/min_terminated_length": 875.0,
+      "entropy": 0.8196670189499855,
+      "epoch": 0.46366145354185834,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021770994644612074,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 443992041.0,
+      "reward": 0.4453125,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999759197235107,
+      "sampling/importance_sampling_ratio/min": 0.0001795605494407937,
+      "sampling/sampling_logp_difference/max": 8.624998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019003838300704956,
+      "step": 504
+    },
+    {
+      "clip_ratio/high_max": 1.287241002501105e-05,
+      "clip_ratio/high_mean": 3.2181025062527624e-06,
+      "clip_ratio/low_mean": 4.5685408849749365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.89035115833758e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15168.0,
+      "completions/mean_length": 5209.140625,
+      "completions/mean_terminated_length": 5031.76220703125,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "entropy": 0.8851845487952232,
+      "epoch": 0.46458141674333026,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00788798462599516,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 444679675.0,
+      "reward": 0.4609375,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796748161316,
+      "sampling/importance_sampling_ratio/min": 0.00025673024356365204,
+      "sampling/sampling_logp_difference/max": 8.267484664916992,
+      "sampling/sampling_logp_difference/mean": 0.018808994442224503,
+      "step": 505
+    },
+    {
+      "clip_ratio/high_max": 2.294301202709903e-05,
+      "clip_ratio/high_mean": 6.590465602585027e-06,
+      "clip_ratio/low_mean": 5.944662643742049e-05,
+      "clip_ratio/low_min": 8.106994755507912e-06,
+      "clip_ratio/region_mean": 6.603709243790945e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16259.0,
+      "completions/mean_length": 7558.8984375,
+      "completions/mean_terminated_length": 7274.21728515625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.003449946641922,
+      "epoch": 0.46550137994480223,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004547314252704382,
+      "learning_rate": 1e-05,
+      "loss": 0.1586,
+      "num_tokens": 445668126.0,
+      "reward": 0.421875,
+      "reward_std": 0.42293959856033325,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999848484992981,
+      "sampling/importance_sampling_ratio/min": 0.00011622780584730208,
+      "sampling/sampling_logp_difference/max": 9.059958457946777,
+      "sampling/sampling_logp_difference/mean": 0.02099413052201271,
+      "step": 506
+    },
+    {
+      "clip_ratio/high_max": 2.1350435872591333e-05,
+      "clip_ratio/high_mean": 6.047981628398702e-06,
+      "clip_ratio/low_mean": 8.880347786544007e-05,
+      "clip_ratio/low_min": 9.06585455595632e-06,
+      "clip_ratio/region_mean": 9.485145938015194e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16137.0,
+      "completions/max_terminated_length": 16137.0,
+      "completions/mean_length": 6066.6015625,
+      "completions/mean_terminated_length": 6066.6015625,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "entropy": 0.8450648710131645,
+      "epoch": 0.46642134314627415,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004621773958206177,
+      "learning_rate": 1e-05,
+      "loss": 0.121,
+      "num_tokens": 446464587.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000154972076416,
+      "sampling/importance_sampling_ratio/min": 1.3950601896794979e-05,
+      "sampling/sampling_logp_difference/max": 11.179987907409668,
+      "sampling/sampling_logp_difference/mean": 0.018016980960965157,
+      "step": 507
+    },
+    {
+      "clip_ratio/high_max": 3.0534724828612525e-06,
+      "clip_ratio/high_mean": 7.633681207153131e-07,
+      "clip_ratio/low_mean": 2.149350007130124e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2256868305703392e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 6988.0234375,
+      "completions/mean_terminated_length": 6838.88134765625,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 1.0452716201543808,
+      "epoch": 0.46734130634774607,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004523546434938908,
+      "learning_rate": 1e-05,
+      "loss": 0.0396,
+      "num_tokens": 447381134.0,
+      "reward": 0.3515625,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999901056289673,
+      "sampling/importance_sampling_ratio/min": 0.016167031601071358,
+      "sampling/sampling_logp_difference/max": 4.124781131744385,
+      "sampling/sampling_logp_difference/mean": 0.021812722086906433,
+      "step": 508
+    },
+    {
+      "clip_ratio/high_max": 5.58759120394825e-06,
+      "clip_ratio/high_mean": 1.3968978009870625e-06,
+      "clip_ratio/low_mean": 3.684896307731833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.824586099199223e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12316.0,
+      "completions/max_terminated_length": 12316.0,
+      "completions/mean_length": 5948.5,
+      "completions/mean_terminated_length": 5948.5,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8241566568613052,
+      "epoch": 0.46826126954921804,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004002885892987251,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 448158014.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 0.0008566387114115059,
+      "sampling/sampling_logp_difference/max": 7.062494277954102,
+      "sampling/sampling_logp_difference/mean": 0.018487900495529175,
+      "step": 509
+    },
+    {
+      "clip_ratio/high_max": 1.0490723752809572e-05,
+      "clip_ratio/high_mean": 3.439610338773491e-06,
+      "clip_ratio/low_mean": 3.973086239739132e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3170473020381905e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16044.0,
+      "completions/mean_length": 7966.375,
+      "completions/mean_terminated_length": 7764.3525390625,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.8868448063731194,
+      "epoch": 0.46918123275068996,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019062751671299338,
+      "learning_rate": 1e-05,
+      "loss": 0.0787,
+      "num_tokens": 449197054.0,
+      "reward": 0.40625,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0001614262000657618,
+      "sampling/sampling_logp_difference/max": 8.731462478637695,
+      "sampling/sampling_logp_difference/mean": 0.020015282556414604,
+      "step": 510
+    },
+    {
+      "clip_ratio/high_max": 1.2195105682621943e-05,
+      "clip_ratio/high_mean": 3.0487764206554857e-06,
+      "clip_ratio/low_mean": 3.558348203114292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8632259474979946e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16124.0,
+      "completions/mean_length": 6520.0234375,
+      "completions/mean_terminated_length": 6442.3544921875,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9168323278427124,
+      "epoch": 0.47010119595216193,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00490277074277401,
+      "learning_rate": 1e-05,
+      "loss": 0.0547,
+      "num_tokens": 450050153.0,
+      "reward": 0.484375,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 4.4418397919798736e-06,
+      "sampling/sampling_logp_difference/max": 12.324441909790039,
+      "sampling/sampling_logp_difference/mean": 0.020178331062197685,
+      "step": 511
+    },
+    {
+      "clip_ratio/high_max": 7.95772848505294e-06,
+      "clip_ratio/high_mean": 1.989432121263235e-06,
+      "clip_ratio/low_mean": 3.363800146871654e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.562743381735345e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 6614.5625,
+      "completions/mean_terminated_length": 6217.4306640625,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.8635925352573395,
+      "epoch": 0.47102115915363385,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003792276605963707,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 450915281.0,
+      "reward": 0.5,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999154806137085,
+      "sampling/importance_sampling_ratio/min": 0.004489119164645672,
+      "sampling/sampling_logp_difference/max": 5.40609884262085,
+      "sampling/sampling_logp_difference/mean": 0.019233014434576035,
+      "step": 512
+    },
+    {
+      "clip_ratio/high_max": 1.6306271390931215e-05,
+      "clip_ratio/high_mean": 6.67555605105008e-06,
+      "clip_ratio/low_mean": 3.4846169796765025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1521726302562456e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16206.0,
+      "completions/mean_length": 6458.5078125,
+      "completions/mean_terminated_length": 5970.36865234375,
+      "completions/min_length": 1025.0,
+      "completions/min_terminated_length": 1025.0,
+      "entropy": 0.8816124573349953,
+      "epoch": 0.47194112235510577,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031763892620801926,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 451761322.0,
+      "reward": 0.4921875,
+      "reward_std": 0.282474160194397,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999036192893982,
+      "sampling/importance_sampling_ratio/min": 9.611394489184022e-05,
+      "sampling/sampling_logp_difference/max": 9.24997615814209,
+      "sampling/sampling_logp_difference/mean": 0.01935420371592045,
+      "step": 513
+    },
+    {
+      "clip_ratio/high_max": 7.861634912842419e-06,
+      "clip_ratio/high_mean": 3.0314158721012063e-06,
+      "clip_ratio/low_mean": 2.2518463538290234e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.554987941039144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15928.0,
+      "completions/mean_length": 5844.03125,
+      "completions/mean_terminated_length": 5676.73046875,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "entropy": 0.9008020162582397,
+      "epoch": 0.47286108555657774,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004134794697165489,
+      "learning_rate": 1e-05,
+      "loss": 0.1094,
+      "num_tokens": 452526342.0,
+      "reward": 0.546875,
+      "reward_std": 0.28930899500846863,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999297857284546,
+      "sampling/importance_sampling_ratio/min": 0.00012955136480741203,
+      "sampling/sampling_logp_difference/max": 8.951433181762695,
+      "sampling/sampling_logp_difference/mean": 0.02013866975903511,
+      "step": 514
+    },
+    {
+      "clip_ratio/high_max": 1.2711160707112867e-05,
+      "clip_ratio/high_mean": 3.177790176778217e-06,
+      "clip_ratio/low_mean": 2.444096298859222e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.761875293799676e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 6214.5859375,
+      "completions/mean_terminated_length": 6134.51171875,
+      "completions/min_length": 1096.0,
+      "completions/min_terminated_length": 1096.0,
+      "entropy": 0.9522949978709221,
+      "epoch": 0.47378104875804966,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022520655766129494,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 453343385.0,
+      "reward": 0.4921875,
+      "reward_std": 0.20623260736465454,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999879598617554,
+      "sampling/importance_sampling_ratio/min": 3.763851054827683e-05,
+      "sampling/sampling_logp_difference/max": 10.187482833862305,
+      "sampling/sampling_logp_difference/mean": 0.019947605207562447,
+      "step": 515
+    },
+    {
+      "clip_ratio/high_max": 5.724247012039996e-05,
+      "clip_ratio/high_mean": 1.431061753009999e-05,
+      "clip_ratio/low_mean": 3.371703428456385e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8027652155724354e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14376.0,
+      "completions/mean_length": 7138.515625,
+      "completions/mean_terminated_length": 7065.71630859375,
+      "completions/min_length": 846.0,
+      "completions/min_terminated_length": 846.0,
+      "entropy": 0.8856206461787224,
+      "epoch": 0.47470101195952163,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004887089133262634,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 454275379.0,
+      "reward": 0.4609375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999544620513916,
+      "sampling/importance_sampling_ratio/min": 0.004931141622364521,
+      "sampling/sampling_logp_difference/max": 5.312184810638428,
+      "sampling/sampling_logp_difference/mean": 0.019449077546596527,
+      "step": 516
+    },
+    {
+      "clip_ratio/high_max": 1.5607688055752078e-05,
+      "clip_ratio/high_mean": 3.9019220139380195e-06,
+      "clip_ratio/low_mean": 4.936055870530254e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.326248106030107e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15855.0,
+      "completions/mean_length": 6077.796875,
+      "completions/mean_terminated_length": 5915.00830078125,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.862022191286087,
+      "epoch": 0.47562097516099355,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003875041613355279,
+      "learning_rate": 1e-05,
+      "loss": 0.0366,
+      "num_tokens": 455076625.0,
+      "reward": 0.4921875,
+      "reward_std": 0.23933593928813934,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000392198562622,
+      "sampling/importance_sampling_ratio/min": 3.322543852846138e-05,
+      "sampling/sampling_logp_difference/max": 10.31219482421875,
+      "sampling/sampling_logp_difference/mean": 0.018907926976680756,
+      "step": 517
+    },
+    {
+      "clip_ratio/high_max": 1.0557040241110371e-05,
+      "clip_ratio/high_mean": 3.535163386914064e-06,
+      "clip_ratio/low_mean": 3.7409978290270374e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0945141790871276e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 6211.65625,
+      "completions/mean_terminated_length": 6211.65625,
+      "completions/min_length": 1292.0,
+      "completions/min_terminated_length": 1292.0,
+      "entropy": 0.8835236355662346,
+      "epoch": 0.4765409383624655,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004288897849619389,
+      "learning_rate": 1e-05,
+      "loss": 0.0822,
+      "num_tokens": 455889693.0,
+      "reward": 0.53125,
+      "reward_std": 0.27145031094551086,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999270439147949,
+      "sampling/importance_sampling_ratio/min": 2.5614745027269237e-06,
+      "sampling/sampling_logp_difference/max": 12.874927520751953,
+      "sampling/sampling_logp_difference/mean": 0.01986120268702507,
+      "step": 518
+    },
+    {
+      "clip_ratio/high_max": 2.842265530489385e-06,
+      "clip_ratio/high_mean": 7.105663826223463e-07,
+      "clip_ratio/low_mean": 3.578249538804812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.649306199804414e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16128.0,
+      "completions/mean_length": 7035.609375,
+      "completions/mean_terminated_length": 6962.0,
+      "completions/min_length": 762.0,
+      "completions/min_terminated_length": 762.0,
+      "entropy": 0.9033957049250603,
+      "epoch": 0.47746090156393745,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004230308346450329,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 456809643.0,
+      "reward": 0.3203125,
+      "reward_std": 0.17282497882843018,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999722242355347,
+      "sampling/importance_sampling_ratio/min": 1.670435995038133e-05,
+      "sampling/sampling_logp_difference/max": 10.99984073638916,
+      "sampling/sampling_logp_difference/mean": 0.020262110978364944,
+      "step": 519
+    },
+    {
+      "clip_ratio/high_max": 3.539844283295679e-05,
+      "clip_ratio/high_mean": 9.844010264714598e-06,
+      "clip_ratio/low_mean": 2.8534720058814855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.837873060774655e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16241.0,
+      "completions/mean_length": 6557.40625,
+      "completions/mean_terminated_length": 6321.568359375,
+      "completions/min_length": 1136.0,
+      "completions/min_terminated_length": 1136.0,
+      "entropy": 0.8352414071559906,
+      "epoch": 0.47838086476540936,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029154124204069376,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 457669431.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 5.8480534789850935e-05,
+      "sampling/sampling_logp_difference/max": 9.746816635131836,
+      "sampling/sampling_logp_difference/mean": 0.019474683329463005,
+      "step": 520
+    },
+    {
+      "clip_ratio/high_max": 6.400114170901361e-05,
+      "clip_ratio/high_mean": 1.917558859076962e-05,
+      "clip_ratio/low_mean": 5.166920755073079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.084479466357152e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15428.0,
+      "completions/mean_length": 6444.1328125,
+      "completions/mean_terminated_length": 6205.576171875,
+      "completions/min_length": 398.0,
+      "completions/min_terminated_length": 398.0,
+      "entropy": 0.7480100840330124,
+      "epoch": 0.47930082796688134,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025195449125021696,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 458512648.0,
+      "reward": 0.515625,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999996542930603,
+      "sampling/importance_sampling_ratio/min": 2.4302940801135264e-05,
+      "sampling/sampling_logp_difference/max": 10.624913215637207,
+      "sampling/sampling_logp_difference/mean": 0.01779567077755928,
+      "step": 521
+    },
+    {
+      "clip_ratio/high_max": 2.748944325503544e-06,
+      "clip_ratio/high_mean": 6.87236081375886e-07,
+      "clip_ratio/low_mean": 3.4855478702411347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5542715181691165e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15868.0,
+      "completions/mean_length": 6615.234375,
+      "completions/mean_terminated_length": 6380.7841796875,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "entropy": 0.8428665772080421,
+      "epoch": 0.48022079116835326,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004339073318988085,
+      "learning_rate": 1e-05,
+      "loss": 0.0608,
+      "num_tokens": 459377790.0,
+      "reward": 0.5234375,
+      "reward_std": 0.31064465641975403,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999370574951172,
+      "sampling/importance_sampling_ratio/min": 0.00042492515058256686,
+      "sampling/sampling_logp_difference/max": 7.76359748840332,
+      "sampling/sampling_logp_difference/mean": 0.018815383315086365,
+      "step": 522
+    },
+    {
+      "clip_ratio/high_max": 2.2513844896820956e-05,
+      "clip_ratio/high_mean": 7.496596083456097e-06,
+      "clip_ratio/low_mean": 2.2591082483813807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0087678169365972e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15239.0,
+      "completions/mean_length": 6200.3203125,
+      "completions/mean_terminated_length": 5955.912109375,
+      "completions/min_length": 1032.0,
+      "completions/min_terminated_length": 1032.0,
+      "entropy": 0.9044734612107277,
+      "epoch": 0.48114075436982523,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005003004334867001,
+      "learning_rate": 1e-05,
+      "loss": 0.0502,
+      "num_tokens": 460189823.0,
+      "reward": 0.484375,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999645948410034,
+      "sampling/importance_sampling_ratio/min": 0.005019097588956356,
+      "sampling/sampling_logp_difference/max": 5.2945051193237305,
+      "sampling/sampling_logp_difference/mean": 0.0192951001226902,
+      "step": 523
+    },
+    {
+      "clip_ratio/high_max": 1.9086801785306307e-05,
+      "clip_ratio/high_mean": 4.771700446326577e-06,
+      "clip_ratio/low_mean": 3.145246773783583e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.622416772941506e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15706.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 5758.9140625,
+      "completions/mean_terminated_length": 5758.9140625,
+      "completions/min_length": 1181.0,
+      "completions/min_terminated_length": 1181.0,
+      "entropy": 0.8783154934644699,
+      "epoch": 0.48206071757129715,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005491400603204966,
+      "learning_rate": 1e-05,
+      "loss": 0.0209,
+      "num_tokens": 460944164.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2330428510904312,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.003907227888703346,
+      "sampling/sampling_logp_difference/max": 5.54492712020874,
+      "sampling/sampling_logp_difference/mean": 0.019315458834171295,
+      "step": 524
+    },
+    {
+      "clip_ratio/high_max": 1.5554858691757545e-05,
+      "clip_ratio/high_mean": 3.888714672939386e-06,
+      "clip_ratio/low_mean": 9.616303373150004e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.3505018273463065e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15536.0,
+      "completions/mean_length": 7573.375,
+      "completions/mean_terminated_length": 7504.0,
+      "completions/min_length": 1579.0,
+      "completions/min_terminated_length": 1579.0,
+      "entropy": 1.057753436267376,
+      "epoch": 0.48298068077276907,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0038622859865427017,
+      "learning_rate": 1e-05,
+      "loss": 0.0103,
+      "num_tokens": 461931916.0,
+      "reward": 0.3125,
+      "reward_std": 0.14123955368995667,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.002133321948349476,
+      "sampling/sampling_logp_difference/max": 6.1500749588012695,
+      "sampling/sampling_logp_difference/mean": 0.02145528793334961,
+      "step": 525
+    },
+    {
+      "clip_ratio/high_max": 2.2185531634022482e-05,
+      "clip_ratio/high_mean": 6.324094329102081e-06,
+      "clip_ratio/low_mean": 4.7102344296945375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.342643908079481e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14553.0,
+      "completions/mean_length": 7353.0703125,
+      "completions/mean_terminated_length": 7136.328125,
+      "completions/min_length": 907.0,
+      "completions/min_terminated_length": 907.0,
+      "entropy": 0.9386680871248245,
+      "epoch": 0.48390064397424104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002902502194046974,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 462894701.0,
+      "reward": 0.5234375,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999150037765503,
+      "sampling/importance_sampling_ratio/min": 0.00492977537214756,
+      "sampling/sampling_logp_difference/max": 5.312461853027344,
+      "sampling/sampling_logp_difference/mean": 0.021296534687280655,
+      "step": 526
+    },
+    {
+      "clip_ratio/high_max": 1.8664793969946913e-05,
+      "clip_ratio/high_mean": 4.666198492486728e-06,
+      "clip_ratio/low_mean": 5.111583186589996e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.578203035838669e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15851.0,
+      "completions/mean_length": 7280.953125,
+      "completions/mean_terminated_length": 6987.30615234375,
+      "completions/min_length": 1111.0,
+      "completions/min_terminated_length": 1111.0,
+      "entropy": 0.9424067437648773,
+      "epoch": 0.48482060717571296,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002602500608190894,
+      "learning_rate": 1e-05,
+      "loss": 0.0546,
+      "num_tokens": 463849087.0,
+      "reward": 0.3125,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999302625656128,
+      "sampling/importance_sampling_ratio/min": 4.007156167062931e-05,
+      "sampling/sampling_logp_difference/max": 10.12484359741211,
+      "sampling/sampling_logp_difference/mean": 0.020630592480301857,
+      "step": 527
+    },
+    {
+      "clip_ratio/high_max": 3.77411461158772e-05,
+      "clip_ratio/high_mean": 1.0150766001970624e-05,
+      "clip_ratio/low_mean": 4.5688502041230095e-05,
+      "clip_ratio/low_min": 5.72383623875794e-06,
+      "clip_ratio/region_mean": 5.583926849794807e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14628.0,
+      "completions/max_terminated_length": 14628.0,
+      "completions/mean_length": 6520.6328125,
+      "completions/mean_terminated_length": 6520.6328125,
+      "completions/min_length": 1459.0,
+      "completions/min_terminated_length": 1459.0,
+      "entropy": 0.8501213267445564,
+      "epoch": 0.48574057037718493,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005743890535086393,
+      "learning_rate": 1e-05,
+      "loss": 0.1494,
+      "num_tokens": 464704336.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3413938879966736,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999988079071045,
+      "sampling/importance_sampling_ratio/min": 5.838880315423012e-05,
+      "sampling/sampling_logp_difference/max": 9.74838638305664,
+      "sampling/sampling_logp_difference/mean": 0.018370801582932472,
+      "step": 528
+    },
+    {
+      "clip_ratio/high_max": 9.150254300038796e-06,
+      "clip_ratio/high_mean": 2.287563575009699e-06,
+      "clip_ratio/low_mean": 2.1804387529300584e-05,
+      "clip_ratio/low_min": 3.918126822100021e-06,
+      "clip_ratio/region_mean": 2.4091951559057634e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14675.0,
+      "completions/max_terminated_length": 14675.0,
+      "completions/mean_length": 7111.0,
+      "completions/mean_terminated_length": 7111.0,
+      "completions/min_length": 1288.0,
+      "completions/min_terminated_length": 1288.0,
+      "entropy": 0.8829544633626938,
+      "epoch": 0.48666053357865685,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004826955031603575,
+      "learning_rate": 1e-05,
+      "loss": 0.0967,
+      "num_tokens": 465632152.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2975040376186371,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999524354934692,
+      "sampling/importance_sampling_ratio/min": 0.00011604782775975764,
+      "sampling/sampling_logp_difference/max": 9.061508178710938,
+      "sampling/sampling_logp_difference/mean": 0.019976403564214706,
+      "step": 529
+    },
+    {
+      "clip_ratio/high_max": 2.3185014015325578e-05,
+      "clip_ratio/high_mean": 7.603994390592561e-06,
+      "clip_ratio/low_mean": 4.392900382299558e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.153299889570917e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15132.0,
+      "completions/mean_length": 7797.7109375,
+      "completions/mean_terminated_length": 7448.67431640625,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.9747610911726952,
+      "epoch": 0.48758049678012877,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028944616205990314,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 466648507.0,
+      "reward": 0.390625,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0002612585376482457,
+      "sampling/sampling_logp_difference/max": 8.25,
+      "sampling/sampling_logp_difference/mean": 0.020830729976296425,
+      "step": 530
+    },
+    {
+      "clip_ratio/high_max": 1.4947459476388758e-05,
+      "clip_ratio/high_mean": 3.7368648690971895e-06,
+      "clip_ratio/low_mean": 4.282657914700394e-05,
+      "clip_ratio/low_min": 4.545454430626705e-06,
+      "clip_ratio/region_mean": 4.656344435716164e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6395.4765625,
+      "completions/mean_terminated_length": 6316.82666015625,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "entropy": 0.9015842452645302,
+      "epoch": 0.48850045998160074,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003612271510064602,
+      "learning_rate": 1e-05,
+      "loss": 0.0573,
+      "num_tokens": 467487976.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998596906661987,
+      "sampling/importance_sampling_ratio/min": 1.209868287332938e-06,
+      "sampling/sampling_logp_difference/max": 13.624999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01959329843521118,
+      "step": 531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.8946868863167765e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8946868863167765e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 7298.78125,
+      "completions/mean_terminated_length": 7154.57177734375,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9978953301906586,
+      "epoch": 0.48942042318307266,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002104024635627866,
+      "learning_rate": 1e-05,
+      "loss": 0.0104,
+      "num_tokens": 468445132.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519513130188,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999783039093018,
+      "sampling/importance_sampling_ratio/min": 5.157754640094936e-05,
+      "sampling/sampling_logp_difference/max": 9.872424125671387,
+      "sampling/sampling_logp_difference/mean": 0.021517785266041756,
+      "step": 532
+    },
+    {
+      "clip_ratio/high_max": 2.0034196040796814e-05,
+      "clip_ratio/high_mean": 6.441706659643387e-06,
+      "clip_ratio/low_mean": 3.0451521752183908e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.689322829814046e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16003.0,
+      "completions/mean_length": 7021.53125,
+      "completions/mean_terminated_length": 6561.08154296875,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.9539581760764122,
+      "epoch": 0.49034038638454464,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0009346248698420823,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 469360760.0,
+      "reward": 0.375,
+      "reward_std": 0.20069600641727448,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.0029978419188410044,
+      "sampling/sampling_logp_difference/max": 5.8098626136779785,
+      "sampling/sampling_logp_difference/mean": 0.020538944751024246,
+      "step": 533
+    },
+    {
+      "clip_ratio/high_max": 7.874939228713629e-06,
+      "clip_ratio/high_mean": 1.968734807178407e-06,
+      "clip_ratio/low_mean": 3.2224923302237585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.419365827994625e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15370.0,
+      "completions/max_terminated_length": 15370.0,
+      "completions/mean_length": 6988.2109375,
+      "completions/mean_terminated_length": 6988.2109375,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.9471191540360451,
+      "epoch": 0.49126034958601655,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002331435214728117,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 470274859.0,
+      "reward": 0.3203125,
+      "reward_std": 0.23751860857009888,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002145767212,
+      "sampling/importance_sampling_ratio/min": 0.0015642779180780053,
+      "sampling/sampling_logp_difference/max": 6.460330963134766,
+      "sampling/sampling_logp_difference/mean": 0.02088295854628086,
+      "step": 534
+    },
+    {
+      "clip_ratio/high_max": 1.2364610256554442e-05,
+      "clip_ratio/high_mean": 3.0911525641386106e-06,
+      "clip_ratio/low_mean": 3.8229277151913266e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.132042954552162e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16212.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 7557.453125,
+      "completions/mean_terminated_length": 7557.453125,
+      "completions/min_length": 1064.0,
+      "completions/min_terminated_length": 1064.0,
+      "entropy": 0.9897207245230675,
+      "epoch": 0.4921803127874885,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004562230780720711,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 471263997.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.0001586318830959499,
+      "sampling/sampling_logp_difference/max": 8.748924255371094,
+      "sampling/sampling_logp_difference/mean": 0.02160259149968624,
+      "step": 535
+    },
+    {
+      "clip_ratio/high_max": 2.6050724500237266e-05,
+      "clip_ratio/high_mean": 7.420082738462952e-06,
+      "clip_ratio/low_mean": 5.8747830053107464e-05,
+      "clip_ratio/low_min": 1.3906133062846493e-05,
+      "clip_ratio/region_mean": 6.616791324631777e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15603.0,
+      "completions/mean_length": 6532.1953125,
+      "completions/mean_terminated_length": 6295.75244140625,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.9109068289399147,
+      "epoch": 0.49310027598896045,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004525062162429094,
+      "learning_rate": 1e-05,
+      "loss": 0.0219,
+      "num_tokens": 472120622.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999650120735168,
+      "sampling/importance_sampling_ratio/min": 1.474883083574241e-05,
+      "sampling/sampling_logp_difference/max": 11.124346733093262,
+      "sampling/sampling_logp_difference/mean": 0.019527796655893326,
+      "step": 536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.90738064766083e-05,
+      "clip_ratio/low_min": 1.0626089533616323e-05,
+      "clip_ratio/region_mean": 3.90738064766083e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15011.0,
+      "completions/mean_length": 5994.40625,
+      "completions/mean_terminated_length": 5912.5986328125,
+      "completions/min_length": 531.0,
+      "completions/min_terminated_length": 531.0,
+      "entropy": 0.9276224821805954,
+      "epoch": 0.49402023919043236,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005058468785136938,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 472906346.0,
+      "reward": 0.421875,
+      "reward_std": 0.19044627249240875,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 0.0005196271813474596,
+      "sampling/sampling_logp_difference/max": 7.562398910522461,
+      "sampling/sampling_logp_difference/mean": 0.020568232983350754,
+      "step": 537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.992188062009518e-05,
+      "clip_ratio/low_min": 1.2131874427723233e-05,
+      "clip_ratio/region_mean": 5.992188062009518e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15992.0,
+      "completions/mean_length": 6469.046875,
+      "completions/mean_terminated_length": 6311.6669921875,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.9536962807178497,
+      "epoch": 0.49494020239190434,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007286665495485067,
+      "learning_rate": 1e-05,
+      "loss": 0.1282,
+      "num_tokens": 473756256.0,
+      "reward": 0.3515625,
+      "reward_std": 0.35772189497947693,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000038146972656,
+      "sampling/importance_sampling_ratio/min": 6.244324322324246e-05,
+      "sampling/sampling_logp_difference/max": 9.681252479553223,
+      "sampling/sampling_logp_difference/mean": 0.019624462351202965,
+      "step": 538
+    },
+    {
+      "clip_ratio/high_max": 1.0018506145570427e-05,
+      "clip_ratio/high_mean": 2.504626536392607e-06,
+      "clip_ratio/low_mean": 3.329443018174061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.57990563770727e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15383.0,
+      "completions/max_terminated_length": 15383.0,
+      "completions/mean_length": 5778.703125,
+      "completions/mean_terminated_length": 5778.703125,
+      "completions/min_length": 903.0,
+      "completions/min_terminated_length": 903.0,
+      "entropy": 0.9274095296859741,
+      "epoch": 0.49586016559337626,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031439310405403376,
+      "learning_rate": 1e-05,
+      "loss": -0.0091,
+      "num_tokens": 474515194.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000576972961426,
+      "sampling/importance_sampling_ratio/min": 0.0006267410353757441,
+      "sampling/sampling_logp_difference/max": 7.374977111816406,
+      "sampling/sampling_logp_difference/mean": 0.019796252250671387,
+      "step": 539
+    },
+    {
+      "clip_ratio/high_max": 3.1761268928676145e-05,
+      "clip_ratio/high_mean": 9.23904565297562e-06,
+      "clip_ratio/low_mean": 4.140612338687788e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.064516949460085e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16146.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 6400.75,
+      "completions/mean_terminated_length": 6400.75,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 0.8927748426795006,
+      "epoch": 0.49678012879484823,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0039032045751810074,
+      "learning_rate": 1e-05,
+      "loss": 0.0938,
+      "num_tokens": 475355186.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3135277032852173,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880194664001,
+      "sampling/importance_sampling_ratio/min": 4.19893694925122e-06,
+      "sampling/sampling_logp_difference/max": 12.3806791305542,
+      "sampling/sampling_logp_difference/mean": 0.019878748804330826,
+      "step": 540
+    },
+    {
+      "clip_ratio/high_max": 2.524126966818585e-05,
+      "clip_ratio/high_mean": 7.227385253827379e-06,
+      "clip_ratio/low_mean": 5.609390495919797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.332129100883321e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14347.0,
+      "completions/mean_length": 7150.234375,
+      "completions/mean_terminated_length": 6928.62451171875,
+      "completions/min_length": 1548.0,
+      "completions/min_terminated_length": 1548.0,
+      "entropy": 0.8632503524422646,
+      "epoch": 0.49770009199632015,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004979084711521864,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 476289752.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3369181156158447,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991059303284,
+      "sampling/importance_sampling_ratio/min": 0.0004304716712795198,
+      "sampling/sampling_logp_difference/max": 7.75062894821167,
+      "sampling/sampling_logp_difference/mean": 0.019658904522657394,
+      "step": 541
+    },
+    {
+      "clip_ratio/high_max": 2.5298505988757825e-05,
+      "clip_ratio/high_mean": 6.324626497189456e-06,
+      "clip_ratio/low_mean": 3.922748987861269e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.555211648948898e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 6855.6640625,
+      "completions/mean_terminated_length": 6704.4208984375,
+      "completions/min_length": 771.0,
+      "completions/min_terminated_length": 771.0,
+      "entropy": 0.8328540697693825,
+      "epoch": 0.49862005519779207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003560611279681325,
+      "learning_rate": 1e-05,
+      "loss": 0.0332,
+      "num_tokens": 477186885.0,
+      "reward": 0.515625,
+      "reward_std": 0.2743411958217621,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998643398284912,
+      "sampling/importance_sampling_ratio/min": 0.00021035241661593318,
+      "sampling/sampling_logp_difference/max": 8.466726303100586,
+      "sampling/sampling_logp_difference/mean": 0.01880962960422039,
+      "step": 542
+    },
+    {
+      "clip_ratio/high_max": 8.90761498339998e-06,
+      "clip_ratio/high_mean": 2.226903745849995e-06,
+      "clip_ratio/low_mean": 5.487640487444878e-05,
+      "clip_ratio/low_min": 6.345177553157555e-06,
+      "clip_ratio/region_mean": 5.7103308108708006e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15880.0,
+      "completions/mean_length": 7117.1015625,
+      "completions/mean_terminated_length": 6818.1689453125,
+      "completions/min_length": 1067.0,
+      "completions/min_terminated_length": 1067.0,
+      "entropy": 0.9280833601951599,
+      "epoch": 0.49954001839926404,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037869063671678305,
+      "learning_rate": 1e-05,
+      "loss": 0.0773,
+      "num_tokens": 478121506.0,
+      "reward": 0.484375,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 3.256524507833092e-07,
+      "sampling/sampling_logp_difference/max": 14.937435150146484,
+      "sampling/sampling_logp_difference/mean": 0.0203043594956398,
+      "step": 543
+    },
+    {
+      "clip_ratio/high_max": 1.3482746680892888e-05,
+      "clip_ratio/high_mean": 3.370686670223222e-06,
+      "clip_ratio/low_mean": 3.976425330165512e-05,
+      "clip_ratio/low_min": 4.979286131856497e-06,
+      "clip_ratio/region_mean": 4.313493991503492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6885.7109375,
+      "completions/mean_terminated_length": 6734.94482421875,
+      "completions/min_length": 1184.0,
+      "completions/min_terminated_length": 1184.0,
+      "entropy": 0.9137701392173767,
+      "epoch": 0.500459981600736,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002787451259791851,
+      "learning_rate": 1e-05,
+      "loss": 0.0847,
+      "num_tokens": 479021365.0,
+      "reward": 0.5,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000042915344238,
+      "sampling/importance_sampling_ratio/min": 0.0013747947523370385,
+      "sampling/sampling_logp_difference/max": 6.589450836181641,
+      "sampling/sampling_logp_difference/mean": 0.02060278132557869,
+      "step": 544
+    },
+    {
+      "clip_ratio/high_max": 2.918380459959735e-05,
+      "clip_ratio/high_mean": 8.077826691987866e-06,
+      "clip_ratio/low_mean": 4.93504342102824e-05,
+      "clip_ratio/low_min": 5.1258921303087845e-06,
+      "clip_ratio/region_mean": 5.742826124333078e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15047.0,
+      "completions/mean_length": 7055.7265625,
+      "completions/mean_terminated_length": 6982.275390625,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "entropy": 1.1009352952241898,
+      "epoch": 0.5013799448022079,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005555091425776482,
+      "learning_rate": 1e-05,
+      "loss": 0.0225,
+      "num_tokens": 479951778.0,
+      "reward": 0.28125,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 2.7657671353154e-07,
+      "sampling/sampling_logp_difference/max": 15.100777626037598,
+      "sampling/sampling_logp_difference/mean": 0.02176634594798088,
+      "step": 545
+    },
+    {
+      "clip_ratio/high_max": 9.75229158939328e-06,
+      "clip_ratio/high_mean": 2.43807289734832e-06,
+      "clip_ratio/low_mean": 3.58120408918694e-05,
+      "clip_ratio/low_min": 5.571651399804978e-06,
+      "clip_ratio/region_mean": 3.825011424396507e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16100.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 6088.2109375,
+      "completions/mean_terminated_length": 6088.2109375,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.7534168809652328,
+      "epoch": 0.5022999080036799,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.00568060576915741,
+      "learning_rate": 1e-05,
+      "loss": 0.1423,
+      "num_tokens": 480749677.0,
+      "reward": 0.6484375,
+      "reward_std": 0.3729842007160187,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999527931213379,
+      "sampling/importance_sampling_ratio/min": 0.0002166072663385421,
+      "sampling/sampling_logp_difference/max": 8.437424659729004,
+      "sampling/sampling_logp_difference/mean": 0.017093103379011154,
+      "step": 546
+    },
+    {
+      "clip_ratio/high_max": 1.821310434024781e-05,
+      "clip_ratio/high_mean": 4.5532760850619525e-06,
+      "clip_ratio/low_mean": 2.870424191314669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.325751754346129e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16029.0,
+      "completions/mean_length": 5638.8515625,
+      "completions/mean_terminated_length": 5380.96826171875,
+      "completions/min_length": 1352.0,
+      "completions/min_terminated_length": 1352.0,
+      "entropy": 0.8868100792169571,
+      "epoch": 0.5032198712051518,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019015485886484385,
+      "learning_rate": 1e-05,
+      "loss": 0.1025,
+      "num_tokens": 481489954.0,
+      "reward": 0.59375,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911904335022,
+      "sampling/importance_sampling_ratio/min": 0.0001796126161934808,
+      "sampling/sampling_logp_difference/max": 8.62470817565918,
+      "sampling/sampling_logp_difference/mean": 0.019102448597550392,
+      "step": 547
+    },
+    {
+      "clip_ratio/high_max": 2.3414544557454064e-05,
+      "clip_ratio/high_mean": 7.0229532411758555e-06,
+      "clip_ratio/low_mean": 3.169551814607985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8718471842003055e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15258.0,
+      "completions/mean_length": 6776.59375,
+      "completions/mean_terminated_length": 6624.095703125,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.9075161814689636,
+      "epoch": 0.5041398344066237,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004203350283205509,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 482375358.0,
+      "reward": 0.453125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999104738235474,
+      "sampling/importance_sampling_ratio/min": 0.0036098493728786707,
+      "sampling/sampling_logp_difference/max": 5.6320695877075195,
+      "sampling/sampling_logp_difference/mean": 0.019327163696289062,
+      "step": 548
+    },
+    {
+      "clip_ratio/high_max": 1.8746226487564854e-05,
+      "clip_ratio/high_mean": 5.84939061809564e-06,
+      "clip_ratio/low_mean": 3.6077018648938974e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.192640903966094e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15684.0,
+      "completions/mean_length": 7507.59375,
+      "completions/mean_terminated_length": 7071.048828125,
+      "completions/min_length": 774.0,
+      "completions/min_terminated_length": 774.0,
+      "entropy": 0.8015655726194382,
+      "epoch": 0.5050597976080957,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004891456104815006,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 483357450.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999200701713562,
+      "sampling/importance_sampling_ratio/min": 0.0032753932755440474,
+      "sampling/sampling_logp_difference/max": 5.721317291259766,
+      "sampling/sampling_logp_difference/mean": 0.019086822867393494,
+      "step": 549
+    },
+    {
+      "clip_ratio/high_max": 2.4045971031227964e-05,
+      "clip_ratio/high_mean": 6.011492757806991e-06,
+      "clip_ratio/low_mean": 3.096040018135682e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.697189299600723e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16107.0,
+      "completions/mean_length": 6061.3125,
+      "completions/mean_terminated_length": 5813.568359375,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.8335569724440575,
+      "epoch": 0.5059797608095676,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003564947983250022,
+      "learning_rate": 1e-05,
+      "loss": 0.028,
+      "num_tokens": 484153554.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26143792271614075,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999876022338867,
+      "sampling/importance_sampling_ratio/min": 0.02006213553249836,
+      "sampling/sampling_logp_difference/max": 3.908921003341675,
+      "sampling/sampling_logp_difference/mean": 0.018360145390033722,
+      "step": 550
+    },
+    {
+      "clip_ratio/high_max": 9.095339009945747e-06,
+      "clip_ratio/high_mean": 2.2738347524864366e-06,
+      "clip_ratio/low_mean": 4.612986276697484e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.840369865632965e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15957.0,
+      "completions/mean_length": 7312.4921875,
+      "completions/mean_terminated_length": 7241.06298828125,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.9900097697973251,
+      "epoch": 0.5068997240110396,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0032013265881687403,
+      "learning_rate": 1e-05,
+      "loss": 0.0976,
+      "num_tokens": 485111601.0,
+      "reward": 0.3125,
+      "reward_std": 0.21040895581245422,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999306201934814,
+      "sampling/importance_sampling_ratio/min": 0.006552733480930328,
+      "sampling/sampling_logp_difference/max": 5.0278730392456055,
+      "sampling/sampling_logp_difference/mean": 0.020712960511446,
+      "step": 551
+    },
+    {
+      "clip_ratio/high_max": 1.360053283860907e-05,
+      "clip_ratio/high_mean": 4.2937051603075815e-06,
+      "clip_ratio/low_mean": 4.3424448904261226e-05,
+      "clip_ratio/low_min": 4.718405762105249e-06,
+      "clip_ratio/region_mean": 4.771815429194248e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14797.0,
+      "completions/max_terminated_length": 14797.0,
+      "completions/mean_length": 6571.4453125,
+      "completions/mean_terminated_length": 6571.4453125,
+      "completions/min_length": 951.0,
+      "completions/min_terminated_length": 951.0,
+      "entropy": 0.8801060244441032,
+      "epoch": 0.5078196872125115,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002972986316308379,
+      "learning_rate": 1e-05,
+      "loss": 0.0888,
+      "num_tokens": 485971554.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998995065689087,
+      "sampling/importance_sampling_ratio/min": 2.4590379325672984e-05,
+      "sampling/sampling_logp_difference/max": 10.613155364990234,
+      "sampling/sampling_logp_difference/mean": 0.020055105909705162,
+      "step": 552
+    },
+    {
+      "clip_ratio/high_max": 8.231255606006016e-06,
+      "clip_ratio/high_mean": 2.057813901501504e-06,
+      "clip_ratio/low_mean": 3.511405452627514e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.71718685983069e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16243.0,
+      "completions/mean_length": 6879.2890625,
+      "completions/mean_terminated_length": 6728.4208984375,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "entropy": 0.8452998399734497,
+      "epoch": 0.5087396504139834,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00798189826309681,
+      "learning_rate": 1e-05,
+      "loss": 0.0278,
+      "num_tokens": 486873791.0,
+      "reward": 0.4609375,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493956565857,
+      "sampling/importance_sampling_ratio/min": 0.005210345610976219,
+      "sampling/sampling_logp_difference/max": 5.25710916519165,
+      "sampling/sampling_logp_difference/mean": 0.02010834403336048,
+      "step": 553
+    },
+    {
+      "clip_ratio/high_max": 1.757707786964602e-05,
+      "clip_ratio/high_mean": 4.394269467411505e-06,
+      "clip_ratio/low_mean": 6.0756912262149854e-05,
+      "clip_ratio/low_min": 1.0878021839744179e-05,
+      "clip_ratio/region_mean": 6.51511809337535e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16237.0,
+      "completions/max_terminated_length": 16237.0,
+      "completions/mean_length": 7169.8828125,
+      "completions/mean_terminated_length": 7169.8828125,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.9671438857913017,
+      "epoch": 0.5096596136154554,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038661460857838392,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 487814936.0,
+      "reward": 0.3359375,
+      "reward_std": 0.23751862347126007,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 4.6830271458020434e-05,
+      "sampling/sampling_logp_difference/max": 9.96898078918457,
+      "sampling/sampling_logp_difference/mean": 0.02097059041261673,
+      "step": 554
+    },
+    {
+      "clip_ratio/high_max": 4.649260063160909e-06,
+      "clip_ratio/high_mean": 1.1623150157902273e-06,
+      "clip_ratio/low_mean": 3.180719090778439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2969506037261453e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15458.0,
+      "completions/mean_length": 6945.0390625,
+      "completions/mean_terminated_length": 6870.71630859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9309702143073082,
+      "epoch": 0.5105795768169273,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002214127918705344,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 488720293.0,
+      "reward": 0.375,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914169311523,
+      "sampling/importance_sampling_ratio/min": 0.00032080389792099595,
+      "sampling/sampling_logp_difference/max": 8.04468059539795,
+      "sampling/sampling_logp_difference/mean": 0.01968962326645851,
+      "step": 555
+    },
+    {
+      "clip_ratio/high_max": 1.5428002825501608e-05,
+      "clip_ratio/high_mean": 3.857000706375402e-06,
+      "clip_ratio/low_mean": 5.9988536690980254e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.384553716998198e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5970.1015625,
+      "completions/mean_terminated_length": 5804.8017578125,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.8274230882525444,
+      "epoch": 0.5114995400183993,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026088031008839607,
+      "learning_rate": 1e-05,
+      "loss": 0.0919,
+      "num_tokens": 489504626.0,
+      "reward": 0.484375,
+      "reward_std": 0.3237725496292114,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999892711639404,
+      "sampling/importance_sampling_ratio/min": 0.00033548183273524046,
+      "sampling/sampling_logp_difference/max": 7.999942779541016,
+      "sampling/sampling_logp_difference/mean": 0.018132124096155167,
+      "step": 556
+    },
+    {
+      "clip_ratio/high_max": 1.628765676287003e-05,
+      "clip_ratio/high_mean": 5.032566036788921e-06,
+      "clip_ratio/low_mean": 3.257978141846252e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.761234722787776e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15636.0,
+      "completions/mean_length": 7099.578125,
+      "completions/mean_terminated_length": 6952.20654296875,
+      "completions/min_length": 567.0,
+      "completions/min_terminated_length": 567.0,
+      "entropy": 0.8690815567970276,
+      "epoch": 0.5124195032198712,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0040014018304646015,
+      "learning_rate": 1e-05,
+      "loss": 0.0021,
+      "num_tokens": 490431156.0,
+      "reward": 0.4609375,
+      "reward_std": 0.25460803508758545,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368786811829,
+      "sampling/importance_sampling_ratio/min": 0.0007102031959220767,
+      "sampling/sampling_logp_difference/max": 7.249959468841553,
+      "sampling/sampling_logp_difference/mean": 0.02036934345960617,
+      "step": 557
+    },
+    {
+      "clip_ratio/high_max": 1.3314914440343273e-05,
+      "clip_ratio/high_mean": 3.3287286100858182e-06,
+      "clip_ratio/low_mean": 3.747020150512981e-05,
+      "clip_ratio/low_min": 3.852436293527717e-06,
+      "clip_ratio/region_mean": 4.079892983099853e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7253.296875,
+      "completions/mean_terminated_length": 6725.07421875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8692722395062447,
+      "epoch": 0.5133394664213431,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002252641599625349,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 491378450.0,
+      "reward": 0.328125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999855756759644,
+      "sampling/importance_sampling_ratio/min": 1.893525586638134e-05,
+      "sampling/sampling_logp_difference/max": 10.87448501586914,
+      "sampling/sampling_logp_difference/mean": 0.01926814392209053,
+      "step": 558
+    },
+    {
+      "clip_ratio/high_max": 3.51339258486405e-05,
+      "clip_ratio/high_mean": 1.0567253070803417e-05,
+      "clip_ratio/low_mean": 3.905345306520758e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962070602232416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 7827.0234375,
+      "completions/mean_terminated_length": 7406.18798828125,
+      "completions/min_length": 808.0,
+      "completions/min_terminated_length": 808.0,
+      "entropy": 0.9718392416834831,
+      "epoch": 0.5142594296228151,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023995323572307825,
+      "learning_rate": 1e-05,
+      "loss": 0.0684,
+      "num_tokens": 492398757.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999961256980896,
+      "sampling/importance_sampling_ratio/min": 0.0003522284678183496,
+      "sampling/sampling_logp_difference/max": 7.951230525970459,
+      "sampling/sampling_logp_difference/mean": 0.020725054666399956,
+      "step": 559
+    },
+    {
+      "clip_ratio/high_max": 9.237001677320222e-06,
+      "clip_ratio/high_mean": 2.3092504193300556e-06,
+      "clip_ratio/low_mean": 4.477454979223694e-05,
+      "clip_ratio/low_min": 3.5987793580716243e-06,
+      "clip_ratio/region_mean": 4.708380049578409e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14833.0,
+      "completions/max_terminated_length": 14833.0,
+      "completions/mean_length": 6578.53125,
+      "completions/mean_terminated_length": 6578.53125,
+      "completions/min_length": 80.0,
+      "completions/min_terminated_length": 80.0,
+      "entropy": 0.9265799149870872,
+      "epoch": 0.515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0053934333845973015,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 493259049.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999976396560669,
+      "sampling/importance_sampling_ratio/min": 1.5993017541404697e-06,
+      "sampling/sampling_logp_difference/max": 13.345943450927734,
+      "sampling/sampling_logp_difference/mean": 0.019497254863381386,
+      "step": 560
+    },
+    {
+      "clip_ratio/high_max": 6.991247119003674e-06,
+      "clip_ratio/high_mean": 2.580789669082151e-06,
+      "clip_ratio/low_mean": 4.2538599473118666e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.511938891482714e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15783.0,
+      "completions/mean_length": 7893.7734375,
+      "completions/mean_terminated_length": 7826.92138671875,
+      "completions/min_length": 763.0,
+      "completions/min_terminated_length": 763.0,
+      "entropy": 0.9697273746132851,
+      "epoch": 0.516099356025759,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003773769596591592,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 494288028.0,
+      "reward": 0.296875,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000444650650024,
+      "sampling/importance_sampling_ratio/min": 4.6216489863581955e-05,
+      "sampling/sampling_logp_difference/max": 9.982173919677734,
+      "sampling/sampling_logp_difference/mean": 0.020743828266859055,
+      "step": 561
+    },
+    {
+      "clip_ratio/high_max": 1.060595786839258e-05,
+      "clip_ratio/high_mean": 4.29665919909894e-06,
+      "clip_ratio/low_mean": 3.2997783137034276e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729444244982005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15024.0,
+      "completions/mean_length": 6483.7734375,
+      "completions/mean_terminated_length": 6405.81884765625,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.8293593674898148,
+      "epoch": 0.5170193192272309,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.006334445904940367,
+      "learning_rate": 1e-05,
+      "loss": 0.0217,
+      "num_tokens": 495135903.0,
+      "reward": 0.5,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999064207077026,
+      "sampling/importance_sampling_ratio/min": 0.0001236602693097666,
+      "sampling/sampling_logp_difference/max": 8.99797248840332,
+      "sampling/sampling_logp_difference/mean": 0.018669776618480682,
+      "step": 562
+    },
+    {
+      "clip_ratio/high_max": 9.357276894661481e-06,
+      "clip_ratio/high_mean": 2.3393192236653704e-06,
+      "clip_ratio/low_mean": 4.667806888392079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.901738748230855e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16230.0,
+      "completions/mean_length": 6484.546875,
+      "completions/mean_terminated_length": 6246.96044921875,
+      "completions/min_length": 630.0,
+      "completions/min_terminated_length": 630.0,
+      "entropy": 0.7686850279569626,
+      "epoch": 0.5179392824287029,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003286323742941022,
+      "learning_rate": 1e-05,
+      "loss": 0.0865,
+      "num_tokens": 495986277.0,
+      "reward": 0.59375,
+      "reward_std": 0.3763991594314575,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945342540741,
+      "sampling/importance_sampling_ratio/min": 2.0216441043885425e-05,
+      "sampling/sampling_logp_difference/max": 10.809014320373535,
+      "sampling/sampling_logp_difference/mean": 0.018656805157661438,
+      "step": 563
+    },
+    {
+      "clip_ratio/high_max": 3.368905208844808e-05,
+      "clip_ratio/high_mean": 9.76577109668142e-06,
+      "clip_ratio/low_mean": 8.26880966542376e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8034580989478854e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 6411.3203125,
+      "completions/mean_terminated_length": 5746.47509765625,
+      "completions/min_length": 952.0,
+      "completions/min_terminated_length": 952.0,
+      "entropy": 0.899998240172863,
+      "epoch": 0.5188592456301748,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005072349216789007,
+      "learning_rate": 1e-05,
+      "loss": -0.0049,
+      "num_tokens": 496826094.0,
+      "reward": 0.515625,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999135732650757,
+      "sampling/importance_sampling_ratio/min": 0.0038024066016077995,
+      "sampling/sampling_logp_difference/max": 5.5721211433410645,
+      "sampling/sampling_logp_difference/mean": 0.019648944959044456,
+      "step": 564
+    },
+    {
+      "clip_ratio/high_max": 1.726673963275971e-05,
+      "clip_ratio/high_mean": 6.2551004020861e-06,
+      "clip_ratio/low_mean": 4.834715275592316e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4602252930635586e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 7110.0546875,
+      "completions/mean_terminated_length": 6810.89501953125,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 1.0061073675751686,
+      "epoch": 0.5197792088316467,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005030680447816849,
+      "learning_rate": 1e-05,
+      "loss": 0.0871,
+      "num_tokens": 497756469.0,
+      "reward": 0.375,
+      "reward_std": 0.3253750801086426,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999985933303833,
+      "sampling/importance_sampling_ratio/min": 0.0004307488852646202,
+      "sampling/sampling_logp_difference/max": 7.749985218048096,
+      "sampling/sampling_logp_difference/mean": 0.02187274768948555,
+      "step": 565
+    },
+    {
+      "clip_ratio/high_max": 3.3920382520591374e-06,
+      "clip_ratio/high_mean": 8.480095630147844e-07,
+      "clip_ratio/low_mean": 2.627351494766117e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.712152416961544e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7546.484375,
+      "completions/mean_terminated_length": 7261.40283203125,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "entropy": 0.898541085422039,
+      "epoch": 0.5206991720331187,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002894402015954256,
+      "learning_rate": 1e-05,
+      "loss": -0.0016,
+      "num_tokens": 498743411.0,
+      "reward": 0.25,
+      "reward_std": 0.2380426526069641,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998988509178162,
+      "sampling/importance_sampling_ratio/min": 3.340166585985571e-05,
+      "sampling/sampling_logp_difference/max": 10.306904792785645,
+      "sampling/sampling_logp_difference/mean": 0.019597206264734268,
+      "step": 566
+    },
+    {
+      "clip_ratio/high_max": 3.407480107853189e-06,
+      "clip_ratio/high_mean": 8.518700269632973e-07,
+      "clip_ratio/low_mean": 1.9815101950371172e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.066697197733447e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15426.0,
+      "completions/mean_length": 6637.9296875,
+      "completions/mean_terminated_length": 6241.74755859375,
+      "completions/min_length": 340.0,
+      "completions/min_terminated_length": 340.0,
+      "entropy": 0.9469815120100975,
+      "epoch": 0.5216191352345906,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033100086729973555,
+      "learning_rate": 1e-05,
+      "loss": 0.0352,
+      "num_tokens": 499612490.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999792575836182,
+      "sampling/importance_sampling_ratio/min": 0.000214192972634919,
+      "sampling/sampling_logp_difference/max": 8.448633193969727,
+      "sampling/sampling_logp_difference/mean": 0.019627269357442856,
+      "step": 567
+    },
+    {
+      "clip_ratio/high_max": 2.8962323767700582e-05,
+      "clip_ratio/high_mean": 7.2405809419251455e-06,
+      "clip_ratio/low_mean": 6.551078422489809e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.275136522366665e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15136.0,
+      "completions/mean_length": 6903.0859375,
+      "completions/mean_terminated_length": 6752.595703125,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.976447619497776,
+      "epoch": 0.5225390984360626,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006571728736162186,
+      "learning_rate": 1e-05,
+      "loss": 0.0543,
+      "num_tokens": 500515117.0,
+      "reward": 0.40625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999945163726807,
+      "sampling/importance_sampling_ratio/min": 0.016446342691779137,
+      "sampling/sampling_logp_difference/max": 4.107652187347412,
+      "sampling/sampling_logp_difference/mean": 0.020653847604990005,
+      "step": 568
+    },
+    {
+      "clip_ratio/high_max": 1.4576415196643211e-05,
+      "clip_ratio/high_mean": 3.6441037991608027e-06,
+      "clip_ratio/low_mean": 7.513643731726916e-05,
+      "clip_ratio/low_min": 2.2551557776750997e-05,
+      "clip_ratio/region_mean": 7.878054020693526e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15556.0,
+      "completions/mean_length": 6953.8359375,
+      "completions/mean_terminated_length": 6570.49560546875,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "entropy": 0.8397975340485573,
+      "epoch": 0.5234590616375345,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007468517404049635,
+      "learning_rate": 1e-05,
+      "loss": 0.0618,
+      "num_tokens": 501427056.0,
+      "reward": 0.421875,
+      "reward_std": 0.3571978807449341,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000053644180298,
+      "sampling/importance_sampling_ratio/min": 0.0001911464933073148,
+      "sampling/sampling_logp_difference/max": 8.562470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01937997341156006,
+      "step": 569
+    },
+    {
+      "clip_ratio/high_max": 3.168922489749093e-05,
+      "clip_ratio/high_mean": 7.922306224372733e-06,
+      "clip_ratio/low_mean": 3.7468206755875144e-05,
+      "clip_ratio/low_min": 5.264044375508092e-06,
+      "clip_ratio/region_mean": 4.5390514060272835e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15961.0,
+      "completions/mean_length": 7807.09375,
+      "completions/mean_terminated_length": 7458.43896484375,
+      "completions/min_length": 562.0,
+      "completions/min_terminated_length": 562.0,
+      "entropy": 0.7974586114287376,
+      "epoch": 0.5243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004324767272919416,
+      "learning_rate": 1e-05,
+      "loss": 0.0431,
+      "num_tokens": 502445156.0,
+      "reward": 0.265625,
+      "reward_std": 0.3329663574695587,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999243021011353,
+      "sampling/importance_sampling_ratio/min": 2.9874459869461134e-05,
+      "sampling/sampling_logp_difference/max": 10.418506622314453,
+      "sampling/sampling_logp_difference/mean": 0.018592730164527893,
+      "step": 570
+    },
+    {
+      "clip_ratio/high_max": 1.8414293663227e-05,
+      "clip_ratio/high_mean": 5.567038670051261e-06,
+      "clip_ratio/low_mean": 3.436269958001503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9929738250066293e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 6467.890625,
+      "completions/mean_terminated_length": 6310.4921875,
+      "completions/min_length": 874.0,
+      "completions/min_terminated_length": 874.0,
+      "entropy": 0.8665193468332291,
+      "epoch": 0.5252989880404784,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0044867550022900105,
+      "learning_rate": 1e-05,
+      "loss": 0.0434,
+      "num_tokens": 503293398.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.024881144985556602,
+      "sampling/sampling_logp_difference/max": 3.6936450004577637,
+      "sampling/sampling_logp_difference/mean": 0.019022464752197266,
+      "step": 571
+    },
+    {
+      "clip_ratio/high_max": 1.4845849818811985e-05,
+      "clip_ratio/high_mean": 3.711462454702996e-06,
+      "clip_ratio/low_mean": 3.597185968828853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.968332202930469e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16309.0,
+      "completions/mean_length": 6275.796875,
+      "completions/mean_terminated_length": 6115.349609375,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 0.8425783589482307,
+      "epoch": 0.5262189512419503,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033805551938712597,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 504115692.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2569621503353119,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000152587890625,
+      "sampling/importance_sampling_ratio/min": 0.018389537930488586,
+      "sampling/sampling_logp_difference/max": 3.9959733486175537,
+      "sampling/sampling_logp_difference/mean": 0.018935590982437134,
+      "step": 572
+    },
+    {
+      "clip_ratio/high_max": 4.3129479763592826e-05,
+      "clip_ratio/high_mean": 1.3471904480866215e-05,
+      "clip_ratio/low_mean": 1.670091853611666e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0172822903296037e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16116.0,
+      "completions/mean_length": 5396.7890625,
+      "completions/mean_terminated_length": 5222.38916015625,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.8558806329965591,
+      "epoch": 0.5271389144434223,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00652205478399992,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 504826577.0,
+      "reward": 0.546875,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.0017056812066584826,
+      "sampling/sampling_logp_difference/max": 6.373790740966797,
+      "sampling/sampling_logp_difference/mean": 0.018737314268946648,
+      "step": 573
+    },
+    {
+      "clip_ratio/high_max": 6.914692676218692e-06,
+      "clip_ratio/high_mean": 1.728673169054673e-06,
+      "clip_ratio/low_mean": 2.3435458388121333e-05,
+      "clip_ratio/low_min": 3.954319709009724e-06,
+      "clip_ratio/region_mean": 2.5164132239297032e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16298.0,
+      "completions/mean_length": 7798.9765625,
+      "completions/mean_terminated_length": 6991.837890625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 0.8846152648329735,
+      "epoch": 0.5280588776448942,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018958896398544312,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 505846438.0,
+      "reward": 0.328125,
+      "reward_std": 0.21253062784671783,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999515414237976,
+      "sampling/importance_sampling_ratio/min": 2.434831731079612e-05,
+      "sampling/sampling_logp_difference/max": 10.623047828674316,
+      "sampling/sampling_logp_difference/mean": 0.019361287355422974,
+      "step": 574
+    },
+    {
+      "clip_ratio/high_max": 1.085428675651201e-05,
+      "clip_ratio/high_mean": 5.064732249593362e-06,
+      "clip_ratio/low_mean": 5.590463968019321e-05,
+      "clip_ratio/low_min": 4.822531082027126e-06,
+      "clip_ratio/region_mean": 6.096937283928128e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16280.0,
+      "completions/mean_length": 6272.5546875,
+      "completions/mean_terminated_length": 6029.88037109375,
+      "completions/min_length": 901.0,
+      "completions/min_terminated_length": 901.0,
+      "entropy": 0.9714803844690323,
+      "epoch": 0.5289788408463661,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003035407979041338,
+      "learning_rate": 1e-05,
+      "loss": 0.1295,
+      "num_tokens": 506670477.0,
+      "reward": 0.3984375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212026596069,
+      "sampling/importance_sampling_ratio/min": 0.0012103202752768993,
+      "sampling/sampling_logp_difference/max": 6.716870307922363,
+      "sampling/sampling_logp_difference/mean": 0.019988738000392914,
+      "step": 575
+    },
+    {
+      "clip_ratio/high_max": 2.1176599602767965e-05,
+      "clip_ratio/high_mean": 5.294149900691991e-06,
+      "clip_ratio/low_mean": 4.479086726405512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.008501784686814e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6060.75,
+      "completions/mean_terminated_length": 5896.88916015625,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 0.8791732639074326,
+      "epoch": 0.5298988040478381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005080445669591427,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 507471717.0,
+      "reward": 0.421875,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999859929084778,
+      "sampling/importance_sampling_ratio/min": 0.0025768836494535208,
+      "sampling/sampling_logp_difference/max": 5.961174488067627,
+      "sampling/sampling_logp_difference/mean": 0.019146449863910675,
+      "step": 576
+    },
+    {
+      "clip_ratio/high_max": 1.591328441463702e-05,
+      "clip_ratio/high_mean": 3.978321103659255e-06,
+      "clip_ratio/low_mean": 3.991827338722942e-05,
+      "clip_ratio/low_min": 4.394445568323135e-06,
+      "clip_ratio/region_mean": 4.389659511616628e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7221.65625,
+      "completions/mean_terminated_length": 7149.51171875,
+      "completions/min_length": 1071.0,
+      "completions/min_terminated_length": 1071.0,
+      "entropy": 0.9068904295563698,
+      "epoch": 0.53081876724931,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002491918858140707,
+      "learning_rate": 1e-05,
+      "loss": 0.0263,
+      "num_tokens": 508420417.0,
+      "reward": 0.3046875,
+      "reward_std": 0.22908622026443481,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999144077301025,
+      "sampling/importance_sampling_ratio/min": 0.0010015364969149232,
+      "sampling/sampling_logp_difference/max": 6.906219959259033,
+      "sampling/sampling_logp_difference/mean": 0.019857721403241158,
+      "step": 577
+    },
+    {
+      "clip_ratio/high_max": 2.723786337810452e-06,
+      "clip_ratio/high_mean": 6.80946584452613e-07,
+      "clip_ratio/low_mean": 4.729307283923845e-05,
+      "clip_ratio/low_min": 3.3817600524344016e-06,
+      "clip_ratio/region_mean": 4.7974018798413454e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16090.0,
+      "completions/mean_length": 7279.765625,
+      "completions/mean_terminated_length": 6909.67431640625,
+      "completions/min_length": 754.0,
+      "completions/min_terminated_length": 754.0,
+      "entropy": 0.7393763959407806,
+      "epoch": 0.531738730450782,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0038857783656567335,
+      "learning_rate": 1e-05,
+      "loss": 0.1167,
+      "num_tokens": 509367579.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3782213628292084,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372959136963,
+      "sampling/importance_sampling_ratio/min": 8.482332486892119e-05,
+      "sampling/sampling_logp_difference/max": 9.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01783195324242115,
+      "step": 578
+    },
+    {
+      "clip_ratio/high_max": 2.4269288587674964e-05,
+      "clip_ratio/high_mean": 6.067322146918741e-06,
+      "clip_ratio/low_mean": 5.770765028501046e-05,
+      "clip_ratio/low_min": 6.032236342434771e-06,
+      "clip_ratio/region_mean": 6.377497174980817e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15946.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 5381.4375,
+      "completions/mean_terminated_length": 5381.4375,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.8337196409702301,
+      "epoch": 0.5326586936522539,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.004505726508796215,
+      "learning_rate": 1e-05,
+      "loss": 0.1534,
+      "num_tokens": 510076403.0,
+      "reward": 0.484375,
+      "reward_std": 0.3861297369003296,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999825358390808,
+      "sampling/importance_sampling_ratio/min": 0.0021874941885471344,
+      "sampling/sampling_logp_difference/max": 6.124998569488525,
+      "sampling/sampling_logp_difference/mean": 0.019285976886749268,
+      "step": 579
+    },
+    {
+      "clip_ratio/high_max": 1.83111833393923e-05,
+      "clip_ratio/high_mean": 4.577795834848075e-06,
+      "clip_ratio/low_mean": 4.1738339632502175e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.631613546735025e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15789.0,
+      "completions/mean_length": 8440.7109375,
+      "completions/mean_terminated_length": 8250.072265625,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "entropy": 0.8920768201351166,
+      "epoch": 0.5335786568537259,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0039497604593634605,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 511177974.0,
+      "reward": 0.1875,
+      "reward_std": 0.18990950286388397,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910831451416,
+      "sampling/importance_sampling_ratio/min": 0.00021938055579084903,
+      "sampling/sampling_logp_difference/max": 8.424702644348145,
+      "sampling/sampling_logp_difference/mean": 0.020451124757528305,
+      "step": 580
+    },
+    {
+      "clip_ratio/high_max": 1.371111534353986e-05,
+      "clip_ratio/high_mean": 3.427778835884965e-06,
+      "clip_ratio/low_mean": 4.171912905803765e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.514690772339236e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16077.0,
+      "completions/mean_length": 6702.3828125,
+      "completions/mean_terminated_length": 6470.0244140625,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.8600481152534485,
+      "epoch": 0.5344986200551978,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024386425502598286,
+      "learning_rate": 1e-05,
+      "loss": 0.0866,
+      "num_tokens": 512054655.0,
+      "reward": 0.5703125,
+      "reward_std": 0.26645052433013916,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000202655792236,
+      "sampling/importance_sampling_ratio/min": 0.0015237311599776149,
+      "sampling/sampling_logp_difference/max": 6.486593246459961,
+      "sampling/sampling_logp_difference/mean": 0.018986206501722336,
+      "step": 581
+    },
+    {
+      "clip_ratio/high_max": 9.279537152906414e-06,
+      "clip_ratio/high_mean": 4.2680171645770315e-06,
+      "clip_ratio/low_mean": 2.6773893978315755e-05,
+      "clip_ratio/low_min": 4.736104074254399e-06,
+      "clip_ratio/region_mean": 3.1041911142892786e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13410.0,
+      "completions/mean_length": 4845.953125,
+      "completions/mean_terminated_length": 4755.1025390625,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "entropy": 0.9067303538322449,
+      "epoch": 0.5354185832566697,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0072782449424266815,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 512696537.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999409317970276,
+      "sampling/importance_sampling_ratio/min": 0.017822081223130226,
+      "sampling/sampling_logp_difference/max": 4.027317047119141,
+      "sampling/sampling_logp_difference/mean": 0.01862735114991665,
+      "step": 582
+    },
+    {
+      "clip_ratio/high_max": 8.41807559481822e-06,
+      "clip_ratio/high_mean": 2.104518898704555e-06,
+      "clip_ratio/low_mean": 4.360654588708712e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5711064331044327e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16282.0,
+      "completions/mean_length": 6173.171875,
+      "completions/mean_terminated_length": 6011.095703125,
+      "completions/min_length": 756.0,
+      "completions/min_terminated_length": 756.0,
+      "entropy": 0.9604142308235168,
+      "epoch": 0.5363385464581417,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005057654343545437,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 513505135.0,
+      "reward": 0.4375,
+      "reward_std": 0.2767051160335541,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999635219573975,
+      "sampling/importance_sampling_ratio/min": 0.0002380619989708066,
+      "sampling/sampling_logp_difference/max": 8.342979431152344,
+      "sampling/sampling_logp_difference/mean": 0.020879898220300674,
+      "step": 583
+    },
+    {
+      "clip_ratio/high_max": 7.327939783863258e-06,
+      "clip_ratio/high_mean": 3.227510205761064e-06,
+      "clip_ratio/low_mean": 4.2579683963595016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.580719428304292e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15173.0,
+      "completions/mean_length": 5546.5234375,
+      "completions/mean_terminated_length": 5374.50048828125,
+      "completions/min_length": 1113.0,
+      "completions/min_terminated_length": 1113.0,
+      "entropy": 0.8015405982732773,
+      "epoch": 0.5372585096596136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0047672707587480545,
+      "learning_rate": 1e-05,
+      "loss": 0.0991,
+      "num_tokens": 514232058.0,
+      "reward": 0.4921875,
+      "reward_std": 0.27038949728012085,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 5.8323133998783305e-05,
+      "sampling/sampling_logp_difference/max": 9.74951171875,
+      "sampling/sampling_logp_difference/mean": 0.018185433000326157,
+      "step": 584
+    },
+    {
+      "clip_ratio/high_max": 1.3804907666781219e-05,
+      "clip_ratio/high_mean": 4.388961428958282e-06,
+      "clip_ratio/low_mean": 5.04182496570138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.480721097228525e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15778.0,
+      "completions/mean_length": 6637.359375,
+      "completions/mean_terminated_length": 6482.6513671875,
+      "completions/min_length": 1144.0,
+      "completions/min_terminated_length": 1144.0,
+      "entropy": 1.0173144191503525,
+      "epoch": 0.5381784728610856,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005850035231560469,
+      "learning_rate": 1e-05,
+      "loss": 0.0453,
+      "num_tokens": 515103184.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999963104724884,
+      "sampling/importance_sampling_ratio/min": 1.4479226706498594e-07,
+      "sampling/sampling_logp_difference/max": 15.747965812683105,
+      "sampling/sampling_logp_difference/mean": 0.020641878247261047,
+      "step": 585
+    },
+    {
+      "clip_ratio/high_max": 1.594428704265738e-05,
+      "clip_ratio/high_mean": 3.986071760664345e-06,
+      "clip_ratio/low_mean": 5.566071547491447e-05,
+      "clip_ratio/low_min": 8.978264304460026e-06,
+      "clip_ratio/region_mean": 5.964678746295249e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 6940.6171875,
+      "completions/mean_terminated_length": 6866.259765625,
+      "completions/min_length": 1273.0,
+      "completions/min_terminated_length": 1273.0,
+      "entropy": 0.8547529205679893,
+      "epoch": 0.5390984360625575,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037875184789299965,
+      "learning_rate": 1e-05,
+      "loss": 0.0831,
+      "num_tokens": 516009791.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27222442626953125,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999997615814209,
+      "sampling/importance_sampling_ratio/min": 5.772008080384694e-06,
+      "sampling/sampling_logp_difference/max": 12.062490463256836,
+      "sampling/sampling_logp_difference/mean": 0.018527517095208168,
+      "step": 586
+    },
+    {
+      "clip_ratio/high_max": 6.924382887518732e-06,
+      "clip_ratio/high_mean": 1.731095721879683e-06,
+      "clip_ratio/low_mean": 3.340147941344185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5132575476382044e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15387.0,
+      "completions/mean_length": 6837.125,
+      "completions/mean_terminated_length": 6761.95263671875,
+      "completions/min_length": 1319.0,
+      "completions/min_terminated_length": 1319.0,
+      "entropy": 0.9027494043111801,
+      "epoch": 0.5400183992640294,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015506440540775657,
+      "learning_rate": 1e-05,
+      "loss": 0.0502,
+      "num_tokens": 516903335.0,
+      "reward": 0.296875,
+      "reward_std": 0.20593318343162537,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 4.2636147554730996e-05,
+      "sampling/sampling_logp_difference/max": 10.0628080368042,
+      "sampling/sampling_logp_difference/mean": 0.020130250602960587,
+      "step": 587
+    },
+    {
+      "clip_ratio/high_max": 1.2774215747413109e-05,
+      "clip_ratio/high_mean": 3.1935539368532773e-06,
+      "clip_ratio/low_mean": 3.885528553837503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.204883930469805e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7866.703125,
+      "completions/mean_terminated_length": 7222.5380859375,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "entropy": 0.8133657574653625,
+      "epoch": 0.5409383624655014,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003520917845889926,
+      "learning_rate": 1e-05,
+      "loss": 0.1165,
+      "num_tokens": 517929081.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3316730856895447,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 6.223546370165423e-05,
+      "sampling/sampling_logp_difference/max": 9.684585571289062,
+      "sampling/sampling_logp_difference/mean": 0.01890747994184494,
+      "step": 588
+    },
+    {
+      "clip_ratio/high_max": 6.942207619431429e-06,
+      "clip_ratio/high_mean": 1.7355519048578572e-06,
+      "clip_ratio/low_mean": 3.457626269209868e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.631181459695654e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 6701.296875,
+      "completions/mean_terminated_length": 6547.603515625,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9360691756010056,
+      "epoch": 0.5418583256669733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029796145390719175,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 518810247.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2869499921798706,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 2.520391673144218e-10,
+      "sampling/sampling_logp_difference/max": 22.101436614990234,
+      "sampling/sampling_logp_difference/mean": 0.01977725327014923,
+      "step": 589
+    },
+    {
+      "clip_ratio/high_max": 3.7906356737948954e-06,
+      "clip_ratio/high_mean": 9.476589184487239e-07,
+      "clip_ratio/low_mean": 3.738725240509666e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8334911323545384e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15971.0,
+      "completions/mean_length": 7029.453125,
+      "completions/mean_terminated_length": 6804.9443359375,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.9168537557125092,
+      "epoch": 0.5427782888684453,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024249793495982885,
+      "learning_rate": 1e-05,
+      "loss": 0.0477,
+      "num_tokens": 519730577.0,
+      "reward": 0.390625,
+      "reward_std": 0.22803518176078796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 1.6278204384434503e-07,
+      "sampling/sampling_logp_difference/max": 15.630853652954102,
+      "sampling/sampling_logp_difference/mean": 0.01923082396388054,
+      "step": 590
+    },
+    {
+      "clip_ratio/high_max": 2.4759768621152034e-05,
+      "clip_ratio/high_mean": 6.1899421552880085e-06,
+      "clip_ratio/low_mean": 3.2254738812298456e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8444680967586464e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 7255.453125,
+      "completions/mean_terminated_length": 6646.8837890625,
+      "completions/min_length": 832.0,
+      "completions/min_terminated_length": 832.0,
+      "entropy": 0.8241118341684341,
+      "epoch": 0.5436982520699172,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003160425927489996,
+      "learning_rate": 1e-05,
+      "loss": 0.0821,
+      "num_tokens": 520680707.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2461756467819214,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000334978103638,
+      "sampling/importance_sampling_ratio/min": 0.0009408618789166212,
+      "sampling/sampling_logp_difference/max": 6.968714237213135,
+      "sampling/sampling_logp_difference/mean": 0.019255205988883972,
+      "step": 591
+    },
+    {
+      "clip_ratio/high_max": 7.459808557541692e-06,
+      "clip_ratio/high_mean": 1.864952139385423e-06,
+      "clip_ratio/low_mean": 3.9836502310208743e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.170145416537707e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 7819.96875,
+      "completions/mean_terminated_length": 7752.53564453125,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 512.0,
+      "entropy": 1.1218742430210114,
+      "epoch": 0.5446182152713891,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00411194609478116,
+      "learning_rate": 1e-05,
+      "loss": 0.0267,
+      "num_tokens": 521703303.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999041557312012,
+      "sampling/importance_sampling_ratio/min": 0.0003571478300727904,
+      "sampling/sampling_logp_difference/max": 7.937360763549805,
+      "sampling/sampling_logp_difference/mean": 0.022727783769369125,
+      "step": 592
+    },
+    {
+      "clip_ratio/high_max": 1.8858649582398357e-05,
+      "clip_ratio/high_mean": 4.714662395599589e-06,
+      "clip_ratio/low_mean": 3.738353416338214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2098196558981726e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16117.0,
+      "completions/mean_length": 6322.8671875,
+      "completions/mean_terminated_length": 6163.1669921875,
+      "completions/min_length": 637.0,
+      "completions/min_terminated_length": 637.0,
+      "entropy": 0.8323960080742836,
+      "epoch": 0.5455381784728611,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022753921803086996,
+      "learning_rate": 1e-05,
+      "loss": 0.0339,
+      "num_tokens": 522531422.0,
+      "reward": 0.4140625,
+      "reward_std": 0.20753081142902374,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998952150344849,
+      "sampling/importance_sampling_ratio/min": 5.422274170996388e-06,
+      "sampling/sampling_logp_difference/max": 12.124995231628418,
+      "sampling/sampling_logp_difference/mean": 0.01893780007958412,
+      "step": 593
+    },
+    {
+      "clip_ratio/high_max": 3.977598225901602e-06,
+      "clip_ratio/high_mean": 9.943995564754005e-07,
+      "clip_ratio/low_mean": 1.1187657776190463e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.2182057332665863e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16055.0,
+      "completions/mean_length": 7054.0625,
+      "completions/mean_terminated_length": 6905.96875,
+      "completions/min_length": 101.0,
+      "completions/min_terminated_length": 101.0,
+      "entropy": 0.866028867661953,
+      "epoch": 0.546458141674333,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004338000901043415,
+      "learning_rate": 1e-05,
+      "loss": -0.0134,
+      "num_tokens": 523453262.0,
+      "reward": 0.328125,
+      "reward_std": 0.13204573094844818,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998721480369568,
+      "sampling/importance_sampling_ratio/min": 7.97068714746274e-05,
+      "sampling/sampling_logp_difference/max": 9.437154769897461,
+      "sampling/sampling_logp_difference/mean": 0.01982954889535904,
+      "step": 594
+    },
+    {
+      "clip_ratio/high_max": 1.5038514220577781e-05,
+      "clip_ratio/high_mean": 3.7596285551444453e-06,
+      "clip_ratio/low_mean": 3.533169467573316e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9091323742468376e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 7539.0703125,
+      "completions/mean_terminated_length": 7027.3798828125,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "entropy": 0.8601142391562462,
+      "epoch": 0.547378104875805,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003401415189728141,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 524436831.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999969482421875,
+      "sampling/importance_sampling_ratio/min": 2.0915547793265432e-05,
+      "sampling/sampling_logp_difference/max": 10.775017738342285,
+      "sampling/sampling_logp_difference/mean": 0.019884679466485977,
+      "step": 595
+    },
+    {
+      "clip_ratio/high_max": 2.9679867111553904e-05,
+      "clip_ratio/high_mean": 8.187421713046206e-06,
+      "clip_ratio/low_mean": 5.44505830930575e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.263800514716422e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16343.0,
+      "completions/mean_length": 7137.96875,
+      "completions/mean_terminated_length": 6762.11376953125,
+      "completions/min_length": 606.0,
+      "completions/min_terminated_length": 606.0,
+      "entropy": 0.7909424379467964,
+      "epoch": 0.5482980680772769,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002879115054383874,
+      "learning_rate": 1e-05,
+      "loss": 0.0549,
+      "num_tokens": 525368091.0,
+      "reward": 0.546875,
+      "reward_std": 0.27062684297561646,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000025033950806,
+      "sampling/importance_sampling_ratio/min": 0.0004618439415935427,
+      "sampling/sampling_logp_difference/max": 7.680283546447754,
+      "sampling/sampling_logp_difference/mean": 0.01847894862294197,
+      "step": 596
+    },
+    {
+      "clip_ratio/high_max": 5.765416517533595e-06,
+      "clip_ratio/high_mean": 1.4413541293833987e-06,
+      "clip_ratio/low_mean": 3.1269102407804894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2710456423501455e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5486.3671875,
+      "completions/mean_terminated_length": 5224.82421875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9588652476668358,
+      "epoch": 0.5492180312787488,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004545152187347412,
+      "learning_rate": 1e-05,
+      "loss": 0.0549,
+      "num_tokens": 526095378.0,
+      "reward": 0.359375,
+      "reward_std": 0.33508801460266113,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998891353607178,
+      "sampling/importance_sampling_ratio/min": 6.280510569922626e-05,
+      "sampling/sampling_logp_difference/max": 9.675474166870117,
+      "sampling/sampling_logp_difference/mean": 0.02017204463481903,
+      "step": 597
+    },
+    {
+      "clip_ratio/high_max": 1.519483475931338e-05,
+      "clip_ratio/high_mean": 4.732241109195456e-06,
+      "clip_ratio/low_mean": 4.477498589494644e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.950722734520241e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16169.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 6636.0078125,
+      "completions/mean_terminated_length": 6636.0078125,
+      "completions/min_length": 685.0,
+      "completions/min_terminated_length": 685.0,
+      "entropy": 0.9497648254036903,
+      "epoch": 0.5501379944802208,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004040954168885946,
+      "learning_rate": 1e-05,
+      "loss": 0.0477,
+      "num_tokens": 526969459.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474287033081,
+      "sampling/importance_sampling_ratio/min": 2.2340275407373156e-08,
+      "sampling/sampling_logp_difference/max": 17.61687469482422,
+      "sampling/sampling_logp_difference/mean": 0.02086419239640236,
+      "step": 598
+    },
+    {
+      "clip_ratio/high_max": 1.5785165032866644e-05,
+      "clip_ratio/high_mean": 3.946291258216661e-06,
+      "clip_ratio/low_mean": 4.7215530003086315e-05,
+      "clip_ratio/low_min": 5.274039267533226e-06,
+      "clip_ratio/region_mean": 5.116182205711084e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15820.0,
+      "completions/mean_length": 6462.953125,
+      "completions/mean_terminated_length": 6142.9189453125,
+      "completions/min_length": 824.0,
+      "completions/min_terminated_length": 824.0,
+      "entropy": 0.9401230812072754,
+      "epoch": 0.5510579576816927,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004678349941968918,
+      "learning_rate": 1e-05,
+      "loss": 0.1854,
+      "num_tokens": 527822197.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3345640003681183,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997877478599548,
+      "sampling/importance_sampling_ratio/min": 2.8560234568431042e-05,
+      "sampling/sampling_logp_difference/max": 10.463495254516602,
+      "sampling/sampling_logp_difference/mean": 0.019832316786050797,
+      "step": 599
+    },
+    {
+      "clip_ratio/high_max": 4.1415414671064354e-06,
+      "clip_ratio/high_mean": 1.0353853667766089e-06,
+      "clip_ratio/low_mean": 4.795687004843785e-05,
+      "clip_ratio/low_min": 7.76807610236574e-06,
+      "clip_ratio/region_mean": 4.899225518784078e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15170.0,
+      "completions/mean_length": 7172.1015625,
+      "completions/mean_terminated_length": 6951.01611328125,
+      "completions/min_length": 1079.0,
+      "completions/min_terminated_length": 1079.0,
+      "entropy": 0.7962061613798141,
+      "epoch": 0.5519779208831647,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014094997895881534,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 528759458.0,
+      "reward": 0.3515625,
+      "reward_std": 0.16834919154644012,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999281167984009,
+      "sampling/importance_sampling_ratio/min": 0.001331693259999156,
+      "sampling/sampling_logp_difference/max": 6.621304035186768,
+      "sampling/sampling_logp_difference/mean": 0.018519852310419083,
+      "step": 600
+    },
+    {
+      "clip_ratio/high_max": 7.3846517807396594e-06,
+      "clip_ratio/high_mean": 3.018199095095042e-06,
+      "clip_ratio/low_mean": 5.2064756346226204e-05,
+      "clip_ratio/low_min": 5.341652013157727e-06,
+      "clip_ratio/region_mean": 5.5082955441321246e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16195.0,
+      "completions/mean_length": 6612.6484375,
+      "completions/mean_terminated_length": 6378.13623046875,
+      "completions/min_length": 480.0,
+      "completions/min_terminated_length": 480.0,
+      "entropy": 0.8218385726213455,
+      "epoch": 0.5528978840846366,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038943374529480934,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 529626893.0,
+      "reward": 0.390625,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 0.0024450027849525213,
+      "sampling/sampling_logp_difference/max": 6.01370906829834,
+      "sampling/sampling_logp_difference/mean": 0.018441151827573776,
+      "step": 601
+    },
+    {
+      "clip_ratio/high_max": 8.209965471905889e-06,
+      "clip_ratio/high_mean": 2.0524913679764722e-06,
+      "clip_ratio/low_mean": 4.8717710285473004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.077020244925734e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15898.0,
+      "completions/mean_length": 6574.9140625,
+      "completions/mean_terminated_length": 6419.21484375,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.9268836230039597,
+      "epoch": 0.5538178472861086,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027088895440101624,
+      "learning_rate": 1e-05,
+      "loss": 0.0577,
+      "num_tokens": 530486578.0,
+      "reward": 0.4453125,
+      "reward_std": 0.26143792271614075,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000026822090149,
+      "sampling/importance_sampling_ratio/min": 1.1735714906535577e-05,
+      "sampling/sampling_logp_difference/max": 11.352873802185059,
+      "sampling/sampling_logp_difference/mean": 0.020115964114665985,
+      "step": 602
+    },
+    {
+      "clip_ratio/high_max": 5.24967435922008e-06,
+      "clip_ratio/high_mean": 1.31241858980502e-06,
+      "clip_ratio/low_mean": 1.3909025255998131e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5221443845803151e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14361.0,
+      "completions/mean_length": 6209.1953125,
+      "completions/mean_terminated_length": 6129.07861328125,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9574517607688904,
+      "epoch": 0.5547378104875805,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.002628365531563759,
+      "learning_rate": 1e-05,
+      "loss": 0.0461,
+      "num_tokens": 531303083.0,
+      "reward": 0.3671875,
+      "reward_std": 0.13098490238189697,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998608827590942,
+      "sampling/importance_sampling_ratio/min": 2.862734254449606e-05,
+      "sampling/sampling_logp_difference/max": 10.461148262023926,
+      "sampling/sampling_logp_difference/mean": 0.019658785313367844,
+      "step": 603
+    },
+    {
+      "clip_ratio/high_max": 1.9014597455679905e-05,
+      "clip_ratio/high_mean": 4.753649363919976e-06,
+      "clip_ratio/low_mean": 4.9158792762682424e-05,
+      "clip_ratio/low_min": 4.514427928370424e-06,
+      "clip_ratio/region_mean": 5.39124412171077e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13873.0,
+      "completions/mean_length": 7079.1875,
+      "completions/mean_terminated_length": 6855.87255859375,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 0.853938102722168,
+      "epoch": 0.5556577736890524,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004664157051593065,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 532228227.0,
+      "reward": 0.2734375,
+      "reward_std": 0.30327796936035156,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999879598617554,
+      "sampling/importance_sampling_ratio/min": 5.377535785555665e-07,
+      "sampling/sampling_logp_difference/max": 14.43586540222168,
+      "sampling/sampling_logp_difference/mean": 0.018260695040225983,
+      "step": 604
+    },
+    {
+      "clip_ratio/high_max": 3.025483556484687e-05,
+      "clip_ratio/high_mean": 7.563708891211718e-06,
+      "clip_ratio/low_mean": 2.1738228269896354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9301936820047558e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15094.0,
+      "completions/max_terminated_length": 15094.0,
+      "completions/mean_length": 6071.5390625,
+      "completions/mean_terminated_length": 6071.5390625,
+      "completions/min_length": 742.0,
+      "completions/min_terminated_length": 742.0,
+      "entropy": 0.980722151696682,
+      "epoch": 0.5565777368905244,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004579839296638966,
+      "learning_rate": 1e-05,
+      "loss": 0.0168,
+      "num_tokens": 533024264.0,
+      "reward": 0.4765625,
+      "reward_std": 0.30327799916267395,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999982476234436,
+      "sampling/importance_sampling_ratio/min": 0.0003390153287909925,
+      "sampling/sampling_logp_difference/max": 7.989465236663818,
+      "sampling/sampling_logp_difference/mean": 0.01974770799279213,
+      "step": 605
+    },
+    {
+      "clip_ratio/high_max": 1.3344870239961892e-05,
+      "clip_ratio/high_mean": 4.773990667672479e-06,
+      "clip_ratio/low_mean": 5.142044130934664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6194432318079635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7352.484375,
+      "completions/mean_terminated_length": 7209.12744140625,
+      "completions/min_length": 1310.0,
+      "completions/min_terminated_length": 1310.0,
+      "entropy": 0.7858814746141434,
+      "epoch": 0.5574977000919963,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002537919208407402,
+      "learning_rate": 1e-05,
+      "loss": 0.0576,
+      "num_tokens": 533985318.0,
+      "reward": 0.3125,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037981033325,
+      "sampling/importance_sampling_ratio/min": 0.0017827138071879745,
+      "sampling/sampling_logp_difference/max": 6.329618453979492,
+      "sampling/sampling_logp_difference/mean": 0.018647275865077972,
+      "step": 606
+    },
+    {
+      "clip_ratio/high_max": 2.345925531699322e-05,
+      "clip_ratio/high_mean": 7.0977013137962786e-06,
+      "clip_ratio/low_mean": 4.466222731025482e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.175992941985896e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16082.0,
+      "completions/mean_length": 7095.1875,
+      "completions/mean_terminated_length": 6947.74658203125,
+      "completions/min_length": 1073.0,
+      "completions/min_terminated_length": 1073.0,
+      "entropy": 0.6846291124820709,
+      "epoch": 0.5584176632934683,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037982286885380745,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 534912558.0,
+      "reward": 0.53125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147057533264,
+      "sampling/importance_sampling_ratio/min": 8.089523180387914e-05,
+      "sampling/sampling_logp_difference/max": 9.422355651855469,
+      "sampling/sampling_logp_difference/mean": 0.01693977229297161,
+      "step": 607
+    },
+    {
+      "clip_ratio/high_max": 5.167851668375079e-06,
+      "clip_ratio/high_mean": 1.2919629170937696e-06,
+      "clip_ratio/low_mean": 6.557838094067847e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.687034363039857e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6038.1953125,
+      "completions/mean_terminated_length": 5873.9765625,
+      "completions/min_length": 677.0,
+      "completions/min_terminated_length": 677.0,
+      "entropy": 0.8637901693582535,
+      "epoch": 0.5593376264949402,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030545955523848534,
+      "learning_rate": 1e-05,
+      "loss": 0.0716,
+      "num_tokens": 535707127.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999387264251709,
+      "sampling/importance_sampling_ratio/min": 0.00017956242663785815,
+      "sampling/sampling_logp_difference/max": 8.624987602233887,
+      "sampling/sampling_logp_difference/mean": 0.018705151975154877,
+      "step": 608
+    },
+    {
+      "clip_ratio/high_max": 1.7691760149318725e-05,
+      "clip_ratio/high_mean": 5.544901910070621e-06,
+      "clip_ratio/low_mean": 5.012885230826214e-05,
+      "clip_ratio/low_min": 3.5653165468829684e-06,
+      "clip_ratio/region_mean": 5.5673754559393274e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14906.0,
+      "completions/mean_length": 6978.0078125,
+      "completions/mean_terminated_length": 6828.70654296875,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "entropy": 0.7931060045957565,
+      "epoch": 0.5602575896964122,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002951717935502529,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 536618376.0,
+      "reward": 0.46875,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 3.865327380481176e-05,
+      "sampling/sampling_logp_difference/max": 10.160879135131836,
+      "sampling/sampling_logp_difference/mean": 0.018486514687538147,
+      "step": 609
+    },
+    {
+      "clip_ratio/high_max": 2.1591150925814873e-05,
+      "clip_ratio/high_mean": 5.397787731453718e-06,
+      "clip_ratio/low_mean": 6.101864732954709e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.6416435629435e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15329.0,
+      "completions/max_terminated_length": 15329.0,
+      "completions/mean_length": 6810.15625,
+      "completions/mean_terminated_length": 6810.15625,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.8957240954041481,
+      "epoch": 0.5611775528978841,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019385438645258546,
+      "learning_rate": 1e-05,
+      "loss": 0.0973,
+      "num_tokens": 537513876.0,
+      "reward": 0.328125,
+      "reward_std": 0.28011518716812134,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000025749206543,
+      "sampling/importance_sampling_ratio/min": 4.845474904868752e-05,
+      "sampling/sampling_logp_difference/max": 9.934880256652832,
+      "sampling/sampling_logp_difference/mean": 0.02021351456642151,
+      "step": 610
+    },
+    {
+      "clip_ratio/high_max": 1.4817902865615906e-05,
+      "clip_ratio/high_mean": 5.914362077419355e-06,
+      "clip_ratio/low_mean": 1.2616926369446446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8531288333178964e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16065.0,
+      "completions/mean_length": 6940.4140625,
+      "completions/mean_terminated_length": 6713.7685546875,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.8646975234150887,
+      "epoch": 0.562097516099356,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.001886329147964716,
+      "learning_rate": 1e-05,
+      "loss": 0.0319,
+      "num_tokens": 538419265.0,
+      "reward": 0.375,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000052452087402,
+      "sampling/importance_sampling_ratio/min": 6.893687327647058e-07,
+      "sampling/sampling_logp_difference/max": 14.18748950958252,
+      "sampling/sampling_logp_difference/mean": 0.019072774797677994,
+      "step": 611
+    },
+    {
+      "clip_ratio/high_max": 6.3681300161988474e-06,
+      "clip_ratio/high_mean": 1.5920325040497119e-06,
+      "clip_ratio/low_mean": 3.254086982451554e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4132902555938927e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15960.0,
+      "completions/mean_length": 7508.796875,
+      "completions/mean_terminated_length": 6995.35498046875,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 0.7723299860954285,
+      "epoch": 0.563017479300828,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002031022449955344,
+      "learning_rate": 1e-05,
+      "loss": 0.0335,
+      "num_tokens": 539399127.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2301519513130188,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0056421491317451,
+      "sampling/sampling_logp_difference/max": 5.177490234375,
+      "sampling/sampling_logp_difference/mean": 0.01832709088921547,
+      "step": 612
+    },
+    {
+      "clip_ratio/high_max": 1.5848977909627138e-05,
+      "clip_ratio/high_mean": 3.9622444774067844e-06,
+      "clip_ratio/low_mean": 2.6742804038804024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.070504851621081e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15816.0,
+      "completions/mean_length": 6019.6484375,
+      "completions/mean_terminated_length": 5938.03955078125,
+      "completions/min_length": 1020.0,
+      "completions/min_terminated_length": 1020.0,
+      "entropy": 0.7425512671470642,
+      "epoch": 0.5639374425022999,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003653773572295904,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 540189602.0,
+      "reward": 0.53125,
+      "reward_std": 0.26143303513526917,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999122619628906,
+      "sampling/importance_sampling_ratio/min": 0.005288486368954182,
+      "sampling/sampling_logp_difference/max": 5.242223262786865,
+      "sampling/sampling_logp_difference/mean": 0.017161473631858826,
+      "step": 613
+    },
+    {
+      "clip_ratio/high_max": 1.1017190900020069e-05,
+      "clip_ratio/high_mean": 2.754297725005017e-06,
+      "clip_ratio/low_mean": 3.428678644468164e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7041084169686656e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15861.0,
+      "completions/mean_length": 7155.6953125,
+      "completions/mean_terminated_length": 6621.826171875,
+      "completions/min_length": 987.0,
+      "completions/min_terminated_length": 987.0,
+      "entropy": 0.9789249897003174,
+      "epoch": 0.5648574057037719,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003739065257832408,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 541125587.0,
+      "reward": 0.265625,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999271631240845,
+      "sampling/importance_sampling_ratio/min": 9.236609002982732e-06,
+      "sampling/sampling_logp_difference/max": 11.59233570098877,
+      "sampling/sampling_logp_difference/mean": 0.02008877694606781,
+      "step": 614
+    },
+    {
+      "clip_ratio/high_max": 5.6091539590852335e-06,
+      "clip_ratio/high_mean": 2.4549021873099264e-06,
+      "clip_ratio/low_mean": 4.249646542575647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4951367613066395e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13553.0,
+      "completions/mean_length": 8027.359375,
+      "completions/mean_terminated_length": 7470.25048828125,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 0.9153474718332291,
+      "epoch": 0.5657773689052438,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0020656392443925142,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 542173801.0,
+      "reward": 0.2578125,
+      "reward_std": 0.22225633263587952,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999947190284729,
+      "sampling/importance_sampling_ratio/min": 0.00029620854184031487,
+      "sampling/sampling_logp_difference/max": 8.124446868896484,
+      "sampling/sampling_logp_difference/mean": 0.021495234221220016,
+      "step": 615
+    },
+    {
+      "clip_ratio/high_max": 1.7302586002188036e-05,
+      "clip_ratio/high_mean": 4.325646500547009e-06,
+      "clip_ratio/low_mean": 5.2193488272678223e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6519134659538395e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6115.3828125,
+      "completions/mean_terminated_length": 5952.38916015625,
+      "completions/min_length": 1158.0,
+      "completions/min_terminated_length": 1158.0,
+      "entropy": 0.751783661544323,
+      "epoch": 0.5666973321067157,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00824788399040699,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 542977266.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30616888403892517,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999478459358215,
+      "sampling/importance_sampling_ratio/min": 0.0013296925462782383,
+      "sampling/sampling_logp_difference/max": 6.622807502746582,
+      "sampling/sampling_logp_difference/mean": 0.017732972279191017,
+      "step": 616
+    },
+    {
+      "clip_ratio/high_max": 2.872588265745435e-05,
+      "clip_ratio/high_mean": 8.185486876755022e-06,
+      "clip_ratio/low_mean": 5.301810256241879e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.120358921180014e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15688.0,
+      "completions/mean_length": 7431.3203125,
+      "completions/mean_terminated_length": 7142.52392578125,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.9122852608561516,
+      "epoch": 0.5676172953081877,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005189655348658562,
+      "learning_rate": 1e-05,
+      "loss": 0.0613,
+      "num_tokens": 543947515.0,
+      "reward": 0.484375,
+      "reward_std": 0.21595832705497742,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.00017607140762265772,
+      "sampling/sampling_logp_difference/max": 8.644620895385742,
+      "sampling/sampling_logp_difference/mean": 0.02111673541367054,
+      "step": 617
+    },
+    {
+      "clip_ratio/high_max": 3.984698651038343e-06,
+      "clip_ratio/high_mean": 9.961746627595858e-07,
+      "clip_ratio/low_mean": 3.414959587644262e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.514577088026272e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16378.0,
+      "completions/mean_length": 5700.5546875,
+      "completions/mean_terminated_length": 5530.9765625,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8961661159992218,
+      "epoch": 0.5685372585096596,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004707770887762308,
+      "learning_rate": 1e-05,
+      "loss": 0.0773,
+      "num_tokens": 544694826.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998490214347839,
+      "sampling/importance_sampling_ratio/min": 5.211461817644647e-10,
+      "sampling/sampling_logp_difference/max": 21.374990463256836,
+      "sampling/sampling_logp_difference/mean": 0.018697837367653847,
+      "step": 618
+    },
+    {
+      "clip_ratio/high_max": 1.1809721399913542e-05,
+      "clip_ratio/high_mean": 2.9524303499783855e-06,
+      "clip_ratio/low_mean": 5.229935004535946e-05,
+      "clip_ratio/low_min": 4.098226327187149e-06,
+      "clip_ratio/region_mean": 5.525178062271152e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12422.0,
+      "completions/max_terminated_length": 12422.0,
+      "completions/mean_length": 4201.6796875,
+      "completions/mean_terminated_length": 4201.6796875,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "entropy": 0.7066933363676071,
+      "epoch": 0.5694572217111316,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.00980924628674984,
+      "learning_rate": 1e-05,
+      "loss": 0.0492,
+      "num_tokens": 545255377.0,
+      "reward": 0.5625,
+      "reward_std": 0.38664889335632324,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000074028968811,
+      "sampling/importance_sampling_ratio/min": 7.827866647858173e-05,
+      "sampling/sampling_logp_difference/max": 9.455235481262207,
+      "sampling/sampling_logp_difference/mean": 0.016301468014717102,
+      "step": 619
+    },
+    {
+      "clip_ratio/high_max": 6.093102456361521e-06,
+      "clip_ratio/high_mean": 1.5232756140903803e-06,
+      "clip_ratio/low_mean": 1.853809601470857e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0061371856172627e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13234.0,
+      "completions/mean_length": 5782.2578125,
+      "completions/mean_terminated_length": 5613.9765625,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 0.846621498465538,
+      "epoch": 0.5703771849126035,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005619424395263195,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 546013882.0,
+      "reward": 0.46875,
+      "reward_std": 0.2472364753484726,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000319480895996,
+      "sampling/importance_sampling_ratio/min": 9.447568299947307e-05,
+      "sampling/sampling_logp_difference/max": 9.267168045043945,
+      "sampling/sampling_logp_difference/mean": 0.018704919144511223,
+      "step": 620
+    },
+    {
+      "clip_ratio/high_max": 1.6747734207456233e-05,
+      "clip_ratio/high_mean": 4.186933551864058e-06,
+      "clip_ratio/low_mean": 4.008232758678787e-05,
+      "clip_ratio/low_min": 3.511630438879365e-06,
+      "clip_ratio/region_mean": 4.426926193445979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 7191.4921875,
+      "completions/mean_terminated_length": 7045.57958984375,
+      "completions/min_length": 1379.0,
+      "completions/min_terminated_length": 1379.0,
+      "entropy": 0.7846563309431076,
+      "epoch": 0.5712971481140754,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0063271005637943745,
+      "learning_rate": 1e-05,
+      "loss": 0.0964,
+      "num_tokens": 546954857.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999164342880249,
+      "sampling/importance_sampling_ratio/min": 0.006330032367259264,
+      "sampling/sampling_logp_difference/max": 5.062449932098389,
+      "sampling/sampling_logp_difference/mean": 0.01846012845635414,
+      "step": 621
+    },
+    {
+      "clip_ratio/high_max": 3.451678094279487e-05,
+      "clip_ratio/high_mean": 1.2486661603361426e-05,
+      "clip_ratio/low_mean": 5.253966105556174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.502632390947838e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15529.0,
+      "completions/max_terminated_length": 15529.0,
+      "completions/mean_length": 5491.7421875,
+      "completions/mean_terminated_length": 5491.7421875,
+      "completions/min_length": 1644.0,
+      "completions/min_terminated_length": 1644.0,
+      "entropy": 0.6960643380880356,
+      "epoch": 0.5722171113155474,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005836677737534046,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 547676024.0,
+      "reward": 0.5625,
+      "reward_std": 0.43213340640068054,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999930739402771,
+      "sampling/importance_sampling_ratio/min": 0.00043176248436793685,
+      "sampling/sampling_logp_difference/max": 7.7476348876953125,
+      "sampling/sampling_logp_difference/mean": 0.016565188765525818,
+      "step": 622
+    },
+    {
+      "clip_ratio/high_max": 4.318982973927632e-06,
+      "clip_ratio/high_mean": 1.079745743481908e-06,
+      "clip_ratio/low_mean": 3.0399249226320535e-05,
+      "clip_ratio/low_min": 5.838393462909153e-06,
+      "clip_ratio/region_mean": 3.147899496980244e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16179.0,
+      "completions/mean_length": 6993.125,
+      "completions/mean_terminated_length": 6844.06396484375,
+      "completions/min_length": 980.0,
+      "completions/min_terminated_length": 980.0,
+      "entropy": 0.8031502217054367,
+      "epoch": 0.5731370745170193,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00226933928206563,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 548590080.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19332444667816162,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 1.1417677114877733e-06,
+      "sampling/sampling_logp_difference/max": 13.68293285369873,
+      "sampling/sampling_logp_difference/mean": 0.01880657486617565,
+      "step": 623
+    },
+    {
+      "clip_ratio/high_max": 8.404208529100288e-06,
+      "clip_ratio/high_mean": 2.101052132275072e-06,
+      "clip_ratio/low_mean": 4.231840989632474e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.441946202859981e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15278.0,
+      "completions/max_terminated_length": 15278.0,
+      "completions/mean_length": 5602.8359375,
+      "completions/mean_terminated_length": 5602.8359375,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "entropy": 0.8287182524800301,
+      "epoch": 0.5740570377184913,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005067484453320503,
+      "learning_rate": 1e-05,
+      "loss": 0.0394,
+      "num_tokens": 549327251.0,
+      "reward": 0.5,
+      "reward_std": 0.35218530893325806,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701380729675,
+      "sampling/importance_sampling_ratio/min": 0.0036069792695343494,
+      "sampling/sampling_logp_difference/max": 5.624884605407715,
+      "sampling/sampling_logp_difference/mean": 0.018545404076576233,
+      "step": 624
+    },
+    {
+      "clip_ratio/high_max": 7.49742275729659e-06,
+      "clip_ratio/high_mean": 1.8743556893241475e-06,
+      "clip_ratio/low_mean": 4.6288066641864134e-05,
+      "clip_ratio/low_min": 5.32640206074575e-06,
+      "clip_ratio/region_mean": 4.816242244487512e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15901.0,
+      "completions/mean_length": 6747.0234375,
+      "completions/mean_terminated_length": 6671.1416015625,
+      "completions/min_length": 879.0,
+      "completions/min_terminated_length": 879.0,
+      "entropy": 0.8722762316465378,
+      "epoch": 0.5749770009199632,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023132911883294582,
+      "learning_rate": 1e-05,
+      "loss": 0.0064,
+      "num_tokens": 550208750.0,
+      "reward": 0.390625,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999475479125977,
+      "sampling/importance_sampling_ratio/min": 0.003727440955117345,
+      "sampling/sampling_logp_difference/max": 5.592033386230469,
+      "sampling/sampling_logp_difference/mean": 0.019216621294617653,
+      "step": 625
+    },
+    {
+      "clip_ratio/high_max": 7.693567567912396e-06,
+      "clip_ratio/high_mean": 1.923391891978099e-06,
+      "clip_ratio/low_mean": 6.517495285152108e-05,
+      "clip_ratio/low_min": 1.1217302017030306e-05,
+      "clip_ratio/region_mean": 6.709834497087286e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16027.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 6983.40625,
+      "completions/mean_terminated_length": 6983.40625,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 0.8781512826681137,
+      "epoch": 0.5758969641214351,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036700034979730844,
+      "learning_rate": 1e-05,
+      "loss": 0.0905,
+      "num_tokens": 551123002.0,
+      "reward": 0.328125,
+      "reward_std": 0.2419992983341217,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999868273735046,
+      "sampling/importance_sampling_ratio/min": 5.0360464229015633e-05,
+      "sampling/sampling_logp_difference/max": 9.8963041305542,
+      "sampling/sampling_logp_difference/mean": 0.019318291917443275,
+      "step": 626
+    },
+    {
+      "clip_ratio/high_max": 5.098295332572889e-06,
+      "clip_ratio/high_mean": 1.2745738331432221e-06,
+      "clip_ratio/low_mean": 5.9073974398415885e-05,
+      "clip_ratio/low_min": 6.781316187698394e-06,
+      "clip_ratio/region_mean": 6.034854845893278e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16201.0,
+      "completions/mean_length": 7143.671875,
+      "completions/mean_terminated_length": 6689.22900390625,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.7715872526168823,
+      "epoch": 0.5768169273229071,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036717690527439117,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 552055472.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2212003767490387,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998798966407776,
+      "sampling/importance_sampling_ratio/min": 0.00012340980174485594,
+      "sampling/sampling_logp_difference/max": 9.0,
+      "sampling/sampling_logp_difference/mean": 0.018518533557653427,
+      "step": 627
+    },
+    {
+      "clip_ratio/high_max": 1.778747127900715e-05,
+      "clip_ratio/high_mean": 4.4468678197517875e-06,
+      "clip_ratio/low_mean": 2.460010267668622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9046970439594588e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15729.0,
+      "completions/mean_length": 6558.5859375,
+      "completions/mean_terminated_length": 6075.36865234375,
+      "completions/min_length": 1061.0,
+      "completions/min_terminated_length": 1061.0,
+      "entropy": 0.9016438648104668,
+      "epoch": 0.577736890524379,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0019187588477507234,
+      "learning_rate": 1e-05,
+      "loss": 0.0494,
+      "num_tokens": 552914275.0,
+      "reward": 0.484375,
+      "reward_std": 0.2041158676147461,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999418258666992,
+      "sampling/importance_sampling_ratio/min": 0.00011496193474158645,
+      "sampling/sampling_logp_difference/max": 9.07090950012207,
+      "sampling/sampling_logp_difference/mean": 0.01948089525103569,
+      "step": 628
+    },
+    {
+      "clip_ratio/high_max": 1.383282506139949e-05,
+      "clip_ratio/high_mean": 3.4582062653498724e-06,
+      "clip_ratio/low_mean": 4.3287541757308645e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.674574802265852e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15812.0,
+      "completions/max_terminated_length": 15812.0,
+      "completions/mean_length": 6150.2734375,
+      "completions/mean_terminated_length": 6150.2734375,
+      "completions/min_length": 596.0,
+      "completions/min_terminated_length": 596.0,
+      "entropy": 0.8385711833834648,
+      "epoch": 0.578656853725851,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003598993644118309,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 553719958.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999948740005493,
+      "sampling/importance_sampling_ratio/min": 0.000830297009088099,
+      "sampling/sampling_logp_difference/max": 7.093727111816406,
+      "sampling/sampling_logp_difference/mean": 0.019557828083634377,
+      "step": 629
+    },
+    {
+      "clip_ratio/high_max": 2.668830120455823e-06,
+      "clip_ratio/high_mean": 6.672075301139557e-07,
+      "clip_ratio/low_mean": 1.7461135655594262e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8128343185708218e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 8142.46875,
+      "completions/mean_terminated_length": 7519.16015625,
+      "completions/min_length": 1828.0,
+      "completions/min_terminated_length": 1828.0,
+      "entropy": 0.8508284538984299,
+      "epoch": 0.5795768169273229,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.002453390508890152,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 554784458.0,
+      "reward": 0.390625,
+      "reward_std": 0.1422954648733139,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999715089797974,
+      "sampling/importance_sampling_ratio/min": 0.0002036939695244655,
+      "sampling/sampling_logp_difference/max": 8.498891830444336,
+      "sampling/sampling_logp_difference/mean": 0.019445519894361496,
+      "step": 630
+    },
+    {
+      "clip_ratio/high_max": 1.9002460248884745e-05,
+      "clip_ratio/high_mean": 4.750615062221186e-06,
+      "clip_ratio/low_mean": 3.1556500402984966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630711614732718e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7665.921875,
+      "completions/mean_terminated_length": 7384.693359375,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.7667205557227135,
+      "epoch": 0.5804967801287948,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027936683036386967,
+      "learning_rate": 1e-05,
+      "loss": 0.0245,
+      "num_tokens": 555783296.0,
+      "reward": 0.4296875,
+      "reward_std": 0.24435830116271973,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998488426208496,
+      "sampling/importance_sampling_ratio/min": 0.0002781523216981441,
+      "sampling/sampling_logp_difference/max": 8.187341690063477,
+      "sampling/sampling_logp_difference/mean": 0.01912892609834671,
+      "step": 631
+    },
+    {
+      "clip_ratio/high_max": 1.5569996094200178e-05,
+      "clip_ratio/high_mean": 3.8924990235500445e-06,
+      "clip_ratio/low_mean": 3.8605214058407e-05,
+      "clip_ratio/low_min": 6.2870940382708795e-06,
+      "clip_ratio/region_mean": 4.249771222930576e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 7266.171875,
+      "completions/mean_terminated_length": 6972.04833984375,
+      "completions/min_length": 1117.0,
+      "completions/min_terminated_length": 1117.0,
+      "entropy": 0.7114122956991196,
+      "epoch": 0.5814167433302668,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004213637672364712,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 556732942.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3135277032852173,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999159574508667,
+      "sampling/importance_sampling_ratio/min": 1.760348027346481e-06,
+      "sampling/sampling_logp_difference/max": 13.249999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01689826510846615,
+      "step": 632
+    },
+    {
+      "clip_ratio/high_max": 2.1737864472015644e-05,
+      "clip_ratio/high_mean": 5.434466118003911e-06,
+      "clip_ratio/low_mean": 3.640393322257296e-05,
+      "clip_ratio/low_min": 3.0146634344419e-06,
+      "clip_ratio/region_mean": 4.183839985216764e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6532.9921875,
+      "completions/mean_terminated_length": 6296.568359375,
+      "completions/min_length": 757.0,
+      "completions/min_terminated_length": 757.0,
+      "entropy": 0.7711968123912811,
+      "epoch": 0.5823367065317387,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004169877618551254,
+      "learning_rate": 1e-05,
+      "loss": 0.0406,
+      "num_tokens": 557589141.0,
+      "reward": 0.546875,
+      "reward_std": 0.2675113081932068,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999022483825684,
+      "sampling/importance_sampling_ratio/min": 4.499705482885474e-06,
+      "sampling/sampling_logp_difference/max": 12.311498641967773,
+      "sampling/sampling_logp_difference/mean": 0.018738210201263428,
+      "step": 633
+    },
+    {
+      "clip_ratio/high_max": 6.099523716329713e-06,
+      "clip_ratio/high_mean": 1.5248809290824283e-06,
+      "clip_ratio/low_mean": 6.070675681257853e-05,
+      "clip_ratio/low_min": 5.175126261747209e-06,
+      "clip_ratio/region_mean": 6.223163745744387e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16337.0,
+      "completions/mean_length": 7384.3203125,
+      "completions/mean_terminated_length": 7168.328125,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "entropy": 0.8054972141981125,
+      "epoch": 0.5832566697332107,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032470994628965855,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 558557286.0,
+      "reward": 0.4140625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999680519104004,
+      "sampling/importance_sampling_ratio/min": 0.00019634375348687172,
+      "sampling/sampling_logp_difference/max": 8.535643577575684,
+      "sampling/sampling_logp_difference/mean": 0.019018521532416344,
+      "step": 634
+    },
+    {
+      "clip_ratio/high_max": 4.436853964762122e-05,
+      "clip_ratio/high_mean": 1.1092134911905305e-05,
+      "clip_ratio/low_mean": 3.798940008437057e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.908153437099827e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15918.0,
+      "completions/mean_length": 6131.9453125,
+      "completions/mean_terminated_length": 6051.22021484375,
+      "completions/min_length": 820.0,
+      "completions/min_terminated_length": 820.0,
+      "entropy": 0.8365718051791191,
+      "epoch": 0.5841766329346826,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004848263692110777,
+      "learning_rate": 1e-05,
+      "loss": 0.1247,
+      "num_tokens": 559364639.0,
+      "reward": 0.5625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000056266784668,
+      "sampling/importance_sampling_ratio/min": 5.424115443020128e-06,
+      "sampling/sampling_logp_difference/max": 12.124655723571777,
+      "sampling/sampling_logp_difference/mean": 0.018360167741775513,
+      "step": 635
+    },
+    {
+      "clip_ratio/high_max": 1.9398633412492927e-05,
+      "clip_ratio/high_mean": 4.849658353123232e-06,
+      "clip_ratio/low_mean": 2.7543567512111622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239322609260853e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15724.0,
+      "completions/max_terminated_length": 15724.0,
+      "completions/mean_length": 5746.8828125,
+      "completions/mean_terminated_length": 5746.8828125,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "entropy": 0.6247628927230835,
+      "epoch": 0.5850965961361545,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003403177484869957,
+      "learning_rate": 1e-05,
+      "loss": 0.0279,
+      "num_tokens": 560119248.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999486207962036,
+      "sampling/importance_sampling_ratio/min": 6.475952432083432e-07,
+      "sampling/sampling_logp_difference/max": 14.25,
+      "sampling/sampling_logp_difference/mean": 0.015006184577941895,
+      "step": 636
+    },
+    {
+      "clip_ratio/high_max": 2.857848289750109e-05,
+      "clip_ratio/high_mean": 8.111364707019675e-06,
+      "clip_ratio/low_mean": 4.927243321617425e-05,
+      "clip_ratio/low_min": 5.929088274569949e-06,
+      "clip_ratio/region_mean": 5.738379809372418e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7313.7890625,
+      "completions/mean_terminated_length": 7096.1044921875,
+      "completions/min_length": 1068.0,
+      "completions/min_terminated_length": 1068.0,
+      "entropy": 0.8606570512056351,
+      "epoch": 0.5860165593376265,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004058506805449724,
+      "learning_rate": 1e-05,
+      "loss": 0.093,
+      "num_tokens": 561072493.0,
+      "reward": 0.375,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0006621598731726408,
+      "sampling/sampling_logp_difference/max": 7.320003509521484,
+      "sampling/sampling_logp_difference/mean": 0.01940958946943283,
+      "step": 637
+    },
+    {
+      "clip_ratio/high_max": 2.7213282010052353e-05,
+      "clip_ratio/high_mean": 7.758043807370996e-06,
+      "clip_ratio/low_mean": 4.890350828645751e-05,
+      "clip_ratio/low_min": 3.968002147303196e-06,
+      "clip_ratio/region_mean": 5.666155129802064e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16093.0,
+      "completions/mean_length": 7495.5078125,
+      "completions/mean_terminated_length": 7425.51953125,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8225502669811249,
+      "epoch": 0.5869365225390984,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002768489997833967,
+      "learning_rate": 1e-05,
+      "loss": 0.098,
+      "num_tokens": 562048734.0,
+      "reward": 0.3671875,
+      "reward_std": 0.344813734292984,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 1.4612716768169776e-05,
+      "sampling/sampling_logp_difference/max": 11.133618354797363,
+      "sampling/sampling_logp_difference/mean": 0.0189508069306612,
+      "step": 638
+    },
+    {
+      "clip_ratio/high_max": 2.5246594077543705e-05,
+      "clip_ratio/high_mean": 6.311648519385926e-06,
+      "clip_ratio/low_mean": 4.9131452101391915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.544310107552519e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 6856.5703125,
+      "completions/mean_terminated_length": 6627.912109375,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "entropy": 0.8542520478367805,
+      "epoch": 0.5878564857405704,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002966079628095031,
+      "learning_rate": 1e-05,
+      "loss": 0.0507,
+      "num_tokens": 562945623.0,
+      "reward": 0.40625,
+      "reward_std": 0.3016803562641144,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998261332511902,
+      "sampling/importance_sampling_ratio/min": 0.0001795661955839023,
+      "sampling/sampling_logp_difference/max": 8.624966621398926,
+      "sampling/sampling_logp_difference/mean": 0.019664689898490906,
+      "step": 639
+    },
+    {
+      "clip_ratio/high_max": 1.2127683930884814e-05,
+      "clip_ratio/high_mean": 5.316983106240514e-06,
+      "clip_ratio/low_mean": 4.154238490627904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.685936778514588e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15231.0,
+      "completions/mean_length": 6463.2421875,
+      "completions/mean_terminated_length": 6305.77001953125,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "entropy": 0.8427078947424889,
+      "epoch": 0.5887764489420423,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021058651618659496,
+      "learning_rate": 1e-05,
+      "loss": 0.0164,
+      "num_tokens": 563789214.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24541424214839935,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998518824577332,
+      "sampling/importance_sampling_ratio/min": 0.00043074542190879583,
+      "sampling/sampling_logp_difference/max": 7.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01898353546857834,
+      "step": 640
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 563789214,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-640/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-640/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Metadata extracted from one ``*_model_states.pt`` file by ``parse_model_states``."""
+    # buffer name -> buffer tensor (converted with ``.float()`` when parsed)
+    buffers: dict
+    # list of dicts (param name -> shape); the merge helpers zip it with the fp32 groups (zero2)
+    # or flatten it into a single ordered mapping (zero3)
+    param_shapes: list
+    # ``[key, value]`` pairs copied from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # value stored under ``DS_VERSION``; None when the checkpoint does not record it
+    ds_version: int
+    # frozen param name -> shape; None when the checkpoint has no such entry
+    frozen_param_shapes: dict
+    # frozen param name -> fragment held by this file; None when the checkpoint has no such entry
+    frozen_param_fragments: dict
+
+
+# verbosity switch: any truthy value enables the extra diagnostic prints guarded by ``if debug:``
+debug = 0
+
+# load to cpu
+# (used as ``map_location`` by the ``torch.load`` calls in parse_model_states / parse_optim_states)
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Convert ``text`` to ``int`` when it consists solely of decimal digits.
+
+    Any other string (including the empty string) is returned unchanged.
+
+    ``str.isdecimal`` is used instead of ``str.isdigit`` because the latter also accepts
+    characters such as superscript digits, which ``int()`` rejects with ``ValueError``.
+    """
+    return int(text) if text.isdecimal() else text
+
+
+def natural_keys(text):
+    '''
+    Build a sort key that orders the digit runs inside ``text`` numerically ("human" order).
+
+    Intended use: alist.sort(key=natural_keys)
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    # the capturing group makes re.split keep the digit runs in its result
+    chunks = re.split(r'(\d+)', text)
+    return list(map(atoi, chunks))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """Return the path of the rank-0 model-states file inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory expected to contain the model-states file
+        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name
+
+    Raises:
+        - ``FileNotFoundError``: the directory or the expected file doesn't exist
+        - ``ValueError``: ``zero_stage`` is greater than 2 and not equal to 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch ``file`` stayed unbound and the check below failed with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """Return the files in ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises ``FileNotFoundError`` when nothing matches.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    ckpt_files = sorted(matches, key=natural_keys)
+
+    if not ckpt_files:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_states_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_states_pattern)
+
+
+def parse_model_states(files):
+    """Load each model-states file and keep only the metadata needed to rebuild the weights.
+
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files to read
+
+    Returns:
+        - list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry, i.e. is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: weights_only=False unpickles arbitrary objects - only run this on trusted checkpoints
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional - older checkpoints may not have these entries
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """Load the optimizer-state shards and pull out the fp32 groups stored in each of them.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` shards
+        - ``ds_checkpoint_dir``: checkpoint folder, only used in the error message
+
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per shard
+
+    Raises:
+        - ``ValueError``: the first shard is not a zero checkpoint, the number of shards differs
+          from the recorded partition count, or the zero stage is unknown
+    """
+    state_dicts = []
+    total_files = len(files)
+    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(shard)
+
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded to the stage-specific reconstruction helper
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # pick the reconstruction routine that matches the detected stage
+    if zero_stage <= 2:
+        reconstruct = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        reconstruct = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+
+    return reconstruct(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """Copy the frozen parameters recorded in the first model state into ``state_dict``.
+
+    Does nothing when the first model state has no (or an empty) ``frozen_param_shapes``.
+    ``state_dict`` is updated in place; nothing is returned.
+    """
+    first_state = zero_model_states[0]
+    frozen_param_shapes = first_state.frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+
+    frozen_param_fragments = first_state.frozen_param_fragments
+
+    if debug:
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {wanted_numel}')
+
+        wanted_params = len(frozen_param_shapes)
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # the stored fragment is used as-is, no merging across model states happens here
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Rebuild the trainable parameters of a zero stage <= 2 checkpoint into ``state_dict``.
+
+    For every param group the per-rank partitions are concatenated into one flat vector, which is
+    then cut into consecutive views following the shapes recorded in the first model state.
+    ``state_dict`` is updated in place.
+
+    Raises:
+        - ``ValueError``: after alignment, the consumed and available element counts of a group differ
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    num_param_groups = len(fp32_flat_groups[0])
+
+    if debug:
+        for i in range(world_size):
+            for j in range(num_param_groups):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    merged_single_partition_of_fp32_groups = [
+        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
+        for group_idx in range(num_param_groups)
+    ]
+    avail_numel = sum(merged.numel() for merged in merged_single_partition_of_fp32_groups)
+
+    if debug:
+        wanted_params = sum(len(shapes) for shapes in param_shapes)
+        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = flat_vector.numel()
+        for name, shape in shapes.items():
+            # shapes may or may not provide ``numel()``; otherwise multiply the dimensions
+            if _has_callable(shape, 'numel'):
+                unpartitioned_numel = shape.numel()
+            else:
+                unpartitioned_numel = math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Assemble the state dict of a zero stage <= 2 checkpoint.
+
+    Entries are added in this order: buffers, frozen params (unless
+    ``exclude_frozen_parameters``), trainable params, then aliases for shared params.
+    """
+    first_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    state_dict.update(first_state.buffers)
+    if debug:
+        print(f"added {len(first_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in first_state.shared_params:
+        alias, source = pair[0], pair[1]
+        # only alias entries whose source was actually reconstructed
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """Return ``(partitioned_numel, padding_numel)`` for splitting a parameter over ``world_size`` parts.
+
+    ``partitioned_numel`` is ``unpartitioned_numel / world_size`` rounded up; ``padding_numel`` is
+    the number of elements missing to make ``unpartitioned_numel`` a multiple of ``world_size``.
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """Rebuild the frozen parameters of a zero stage 3 checkpoint into ``state_dict``.
+
+    Each parameter is obtained by concatenating its fragment from every model state, keeping the
+    leading ``shape.numel()`` elements and viewing them with the recorded shape.
+    Does nothing when the first model state has no (or an empty) ``frozen_param_shapes``.
+    """
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            rank_numel = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {rank_numel}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values()) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        fragments = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        # narrow() keeps the first unpartitioned_numel elements of the concatenation
+        merged = torch.cat(fragments, 0)
+        state_dict[name] = merged.narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    The constructor only stores references; ``contiguous()`` builds the real tensor on demand.
+
+    Args:
+        - ``flat_groups``: per-rank lists of flat tensors, indexed as ``flat_groups[rank][group]``
+        - ``flat_groups_offset``: group boundaries within one rank; entry ``g`` is where group ``g``
+          starts and entry ``g + 1`` where it ends, so it has one entry more than there are groups
+        - ``offset``: start of this parameter's partition, relative to the first group boundary
+        - ``partitioned_numel``: number of elements each rank holds for this parameter
+        - ``shape``: full parameter shape; must provide ``numel()`` and be accepted by ``Tensor.view``
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def _group_span(self, end_idx):
+        """Return ``(first, last)`` ids of the groups overlapped by ``[self.offset, end_idx)``."""
+        first = last = None
+        # the last boundary has no group of its own, hence ``- 1``
+        for group_id in range(len(self.flat_groups_offset) - 1):
+            group_start = self.flat_groups_offset[group_id]
+            group_end = self.flat_groups_offset[group_id + 1]
+            if group_start <= self.offset < group_end:
+                first = group_id
+            if group_start < end_idx <= group_end:
+                last = group_id
+                break
+        if first is None or last is None:
+            raise ValueError(f"partition [{self.offset}, {end_idx}) does not fit into the flat groups")
+        return first, last
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns a tensor of ``self.shape`` made of the first ``shape.numel()`` elements of the
+        per-rank partitions concatenated in rank order.
+
+        Raises ``ValueError`` when the partition lies outside the group boundaries.
+        """
+        if self.partitioned_numel == 0:
+            # nothing to gather; also avoids a group lookup that has no valid answer on a boundary
+            return self.flat_groups[0][0].new_empty(0).view(self.shape).contiguous()
+
+        end_idx = self.offset + self.partitioned_numel
+        # the span only depends on offsets, so it is resolved once rather than once per rank
+        first_group, last_group = self._group_span(end_idx)
+        pad_flat_param_chunks = []
+
+        for flat_groups_at_rank_i in self.flat_groups:
+            # collect weights from related group/groups
+            for group_id in range(first_group, last_group + 1):
+                group_start = self.flat_groups_offset[group_id]
+                group_end = self.flat_groups_offset[group_id + 1]
+                # clamp to the group start: for every group after the first one ``self.offset`` lies
+                # before the group, and the unclamped difference would be a negative slice index
+                start_offset = max(self.offset, group_start) - group_start
+                end_offset = min(end_idx, group_end) - group_start
+                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # drop the trailing padding before restoring the shape
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Register the trainable parameters of a zero stage 3 checkpoint in ``state_dict``.
+
+    Every parameter is stored as a ``GatheredTensor``; no data is gathered here.
+
+    Args:
+        - ``state_dict``: mapping updated in place
+        - ``world_size``: number of partitions each parameter is split into
+        - ``fp32_flat_groups``: per-rank lists of flat fp32 tensors (``fp32_flat_groups[rank][group]``)
+        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is read
+
+    Raises:
+        - ``ValueError``: the consumed element count differs from the available one
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # each rank holds a list of flat tensors, so shapes are reported per group
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # group boundaries within one rank: [0, n0, n0 + n1, ...]
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    # offset counted one rank's share so far; scale it to compare against all ranks
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Assemble the state dict of a zero stage 3 checkpoint.
+
+    Entries are added in this order: buffers, frozen params (unless
+    ``exclude_frozen_parameters``), trainable params, then aliases for shared params.
+    """
+    first_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    state_dict.update(first_state.buffers)
+    if debug:
+        print(f"added {len(first_state.buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in first_state.shared_params:
+        alias, source = pair[0], pair[1]
+        # only alias entries whose source was actually reconstructed
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+
+    Each distinct value is converted once via its ``contiguous()`` method; names that map to the
+    very same object share the converted result. With ``return_empty_tensor`` an uninitialized
+    tensor of matching ``shape`` and ``dtype`` is created instead of calling ``contiguous()``.
+    """
+    torch_state_dict = {}
+    first_name_by_id = {}
+    for name, tensor in state_dict.items():
+        key = id(tensor)
+        if key not in first_name_by_id:
+            first_name_by_id[key] = name
+            if return_empty_tensor:
+                converted = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                converted = tensor.contiguous()
+            torch_state_dict[name] = converted
+            continue
+        # shared tensors: reuse what was produced for the first name
+        torch_state_dict[name] = torch_state_dict[first_name_by_id[key]]
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model`: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/README.md b/dapo_milora_plus_20251201_131939/checkpoint-704/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-704/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/adapter_config.json
@@ -0,0 +1,40 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-704/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/latest b/dapo_milora_plus_20251201_131939/checkpoint-704/latest
new file mode 100644
index 0000000000000000000000000000000000000000..a467c93394af75577cc1648673b23e2ec8a3f7c8
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/latest
@@ -0,0 +1 @@
+global_step704
\ No newline at end of file
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-704/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-704/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-704/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..83274adc85c541af911ecf7654656966957f9fcd
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/trainer_state.json
@@ -0,0 +1,21858 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6476540938362465,
+  "eval_steps": 500,
+  "global_step": 704,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15689.0,
+      "completions/max_terminated_length": 15689.0,
+      "completions/mean_length": 6039.171875,
+      "completions/mean_terminated_length": 6039.171875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 1.19118632376194,
+      "epoch": 0.0009199632014719411,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004114801995456219,
+      "learning_rate": 1e-05,
+      "loss": 0.0591,
+      "num_tokens": 792270.0,
+      "reward": 0.25,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999940395355225,
+      "sampling/importance_sampling_ratio/min": 0.0002457273658365011,
+      "sampling/sampling_logp_difference/max": 8.311287879943848,
+      "sampling/sampling_logp_difference/mean": 0.021642697975039482,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 4.125957275391556e-06,
+      "clip_ratio/high_mean": 1.031489318847889e-06,
+      "clip_ratio/low_mean": 5.146006606082665e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.249155537967454e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15112.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 4978.265625,
+      "completions/mean_terminated_length": 4978.265625,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.9862165078520775,
+      "epoch": 0.0018399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004017667844891548,
+      "learning_rate": 1e-05,
+      "loss": 0.0407,
+      "num_tokens": 1452816.0,
+      "reward": 0.3203125,
+      "reward_std": 0.30798622965812683,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999986290931702,
+      "sampling/importance_sampling_ratio/min": 0.00840891432017088,
+      "sampling/sampling_logp_difference/max": 4.778462886810303,
+      "sampling/sampling_logp_difference/mean": 0.01879144087433815,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 5.936832167208195e-06,
+      "clip_ratio/high_mean": 1.4842080418020487e-06,
+      "clip_ratio/low_mean": 5.7621912446848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.910612048865005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16118.0,
+      "completions/mean_length": 6664.3046875,
+      "completions/mean_terminated_length": 6587.771484375,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.9934953600168228,
+      "epoch": 0.0027598896044158236,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002359058242291212,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 2324415.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000684261322021,
+      "sampling/importance_sampling_ratio/min": 0.0018158734310418367,
+      "sampling/sampling_logp_difference/max": 6.311188697814941,
+      "sampling/sampling_logp_difference/mean": 0.02111843228340149,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 1.0255745564791141e-05,
+      "clip_ratio/high_mean": 2.5639363911977853e-06,
+      "clip_ratio/low_mean": 2.648322629283939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9047162797724013e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15915.0,
+      "completions/mean_length": 5801.203125,
+      "completions/mean_terminated_length": 5717.8740234375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 1.0870511680841446,
+      "epoch": 0.0036798528058877645,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002563449554145336,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 3091369.0,
+      "reward": 0.2734375,
+      "reward_std": 0.2580180764198303,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000073790550232,
+      "sampling/importance_sampling_ratio/min": 0.018811559304594994,
+      "sampling/sampling_logp_difference/max": 3.9732837677001953,
+      "sampling/sampling_logp_difference/mean": 0.021363306790590286,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 9.68160156844533e-06,
+      "clip_ratio/high_mean": 2.4204003921113326e-06,
+      "clip_ratio/low_mean": 4.577123684157414e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8191637006311794e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15767.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 5696.4140625,
+      "completions/mean_terminated_length": 5696.4140625,
+      "completions/min_length": 539.0,
+      "completions/min_terminated_length": 539.0,
+      "entropy": 1.1476548686623573,
+      "epoch": 0.004599816007359705,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0025258746463805437,
+      "learning_rate": 1e-05,
+      "loss": -0.0344,
+      "num_tokens": 3841078.0,
+      "reward": 0.3046875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910295009613,
+      "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06,
+      "sampling/sampling_logp_difference/max": 13.41861629486084,
+      "sampling/sampling_logp_difference/mean": 0.020693503320217133,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 3.660332322397153e-05,
+      "clip_ratio/high_mean": 1.029715701861278e-05,
+      "clip_ratio/low_mean": 3.895585894042597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.925301630009926e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14489.0,
+      "completions/mean_length": 5280.890625,
+      "completions/mean_terminated_length": 5104.65087890625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.8976912423968315,
+      "epoch": 0.005519779208831647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0050104837864637375,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 4535640.0,
+      "reward": 0.359375,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998958110809326,
+      "sampling/importance_sampling_ratio/min": 0.0007187551236711442,
+      "sampling/sampling_logp_difference/max": 7.237989902496338,
+      "sampling/sampling_logp_difference/mean": 0.018597707152366638,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 4.484465989662567e-06,
+      "clip_ratio/high_mean": 1.1211164974156418e-06,
+      "clip_ratio/low_mean": 2.823482634539687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9355942729125672e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16166.0,
+      "completions/mean_length": 6874.9453125,
+      "completions/mean_terminated_length": 6568.20166015625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 1.0286128222942352,
+      "epoch": 0.006439742410303588,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0018693821039050817,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "num_tokens": 5434801.0,
+      "reward": 0.203125,
+      "reward_std": 0.21778544783592224,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999491572380066,
+      "sampling/importance_sampling_ratio/min": 5.279039783090411e-07,
+      "sampling/sampling_logp_difference/max": 14.454351425170898,
+      "sampling/sampling_logp_difference/mean": 0.020383886992931366,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 2.5703585606606794e-05,
+      "clip_ratio/high_mean": 7.537758676789963e-06,
+      "clip_ratio/low_mean": 5.802649661745818e-05,
+      "clip_ratio/low_min": 6.0229353948670905e-06,
+      "clip_ratio/region_mean": 6.556425523740472e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15910.0,
+      "completions/mean_length": 5270.5234375,
+      "completions/mean_terminated_length": 5094.119140625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "entropy": 1.0461085885763168,
+      "epoch": 0.007359705611775529,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005809026304632425,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "num_tokens": 6128708.0,
+      "reward": 0.3359375,
+      "reward_std": 0.37320882081985474,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 8.339863597939257e-06,
+      "sampling/sampling_logp_difference/max": 11.694463729858398,
+      "sampling/sampling_logp_difference/mean": 0.02038305625319481,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 3.965832502217381e-05,
+      "clip_ratio/high_mean": 1.2004183304270555e-05,
+      "clip_ratio/low_mean": 2.037043998370791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.237462271954428e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14850.0,
+      "completions/mean_length": 4524.6796875,
+      "completions/mean_terminated_length": 4431.29931640625,
+      "completions/min_length": 60.0,
+      "completions/min_terminated_length": 60.0,
+      "entropy": 0.8275458142161369,
+      "epoch": 0.00827966881324747,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002717240946367383,
+      "learning_rate": 1e-05,
+      "loss": 0.1005,
+      "num_tokens": 6726587.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32325831055641174,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.0002034705103142187,
+      "sampling/sampling_logp_difference/max": 8.49998950958252,
+      "sampling/sampling_logp_difference/mean": 0.017633724957704544,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 7.08802053850377e-06,
+      "clip_ratio/high_mean": 1.7720051346259424e-06,
+      "clip_ratio/low_mean": 3.394487077912345e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.571687602743623e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15243.0,
+      "completions/mean_length": 5129.171875,
+      "completions/mean_terminated_length": 4950.52392578125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.7103187441825867,
+      "epoch": 0.00919963201471941,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005626584868878126,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 7400273.0,
+      "reward": 0.6796875,
+      "reward_std": 0.379814088344574,
+      "rewards/accuracy_reward/mean": 0.6796875,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000073909759521,
+      "sampling/importance_sampling_ratio/min": 0.002478840760886669,
+      "sampling/sampling_logp_difference/max": 5.999964237213135,
+      "sampling/sampling_logp_difference/mean": 0.016138140112161636,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.9378599517949624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9378599517949624e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15482.0,
+      "completions/max_terminated_length": 15482.0,
+      "completions/mean_length": 4741.296875,
+      "completions/mean_terminated_length": 4741.296875,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.903806746006012,
+      "epoch": 0.010119595216191352,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0022279289551079273,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 8026991.0,
+      "reward": 0.3203125,
+      "reward_std": 0.1701665222644806,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.00012343087291810662,
+      "sampling/sampling_logp_difference/max": 8.999829292297363,
+      "sampling/sampling_logp_difference/mean": 0.01844138652086258,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 4.8331594371120445e-06,
+      "clip_ratio/high_mean": 1.93793562175415e-06,
+      "clip_ratio/low_mean": 4.368338659332949e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.562132153296261e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 5855.5546875,
+      "completions/mean_terminated_length": 5602.8720703125,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 1.047883652150631,
+      "epoch": 0.011039558417663294,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038395742885768414,
+      "learning_rate": 1e-05,
+      "loss": 0.1298,
+      "num_tokens": 8797134.0,
+      "reward": 0.421875,
+      "reward_std": 0.3503503203392029,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999397397041321,
+      "sampling/importance_sampling_ratio/min": 0.0007607790757901967,
+      "sampling/sampling_logp_difference/max": 7.1811676025390625,
+      "sampling/sampling_logp_difference/mean": 0.02074582129716873,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 3.0723854251846205e-06,
+      "clip_ratio/high_mean": 7.680963562961551e-07,
+      "clip_ratio/low_mean": 1.2482652891776524e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.325074924807268e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15365.0,
+      "completions/mean_length": 6816.6953125,
+      "completions/mean_terminated_length": 6664.83349609375,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "entropy": 1.1763990670442581,
+      "epoch": 0.011959521619135235,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0011414350010454655,
+      "learning_rate": 1e-05,
+      "loss": 0.0197,
+      "num_tokens": 9691639.0,
+      "reward": 0.25,
+      "reward_std": 0.1354655921459198,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998582601547241,
+      "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08,
+      "sampling/sampling_logp_difference/max": 17.785776138305664,
+      "sampling/sampling_logp_difference/mean": 0.021673155948519707,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 1.3825085034113727e-05,
+      "clip_ratio/high_mean": 3.4562712585284316e-06,
+      "clip_ratio/low_mean": 2.299899915669812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6455270244696294e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15459.0,
+      "completions/max_terminated_length": 15459.0,
+      "completions/mean_length": 5313.53125,
+      "completions/mean_terminated_length": 5313.53125,
+      "completions/min_length": 509.0,
+      "completions/min_terminated_length": 509.0,
+      "entropy": 1.0474217981100082,
+      "epoch": 0.012879484820607176,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004565369803458452,
+      "learning_rate": 1e-05,
+      "loss": 0.0163,
+      "num_tokens": 10391515.0,
+      "reward": 0.296875,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998889565467834,
+      "sampling/importance_sampling_ratio/min": 2.431661960144993e-05,
+      "sampling/sampling_logp_difference/max": 10.624350547790527,
+      "sampling/sampling_logp_difference/mean": 0.020862173289060593,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 1.1656098649837077e-05,
+      "clip_ratio/high_mean": 2.914024662459269e-06,
+      "clip_ratio/low_mean": 6.22073393969913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.512136405945057e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14724.0,
+      "completions/mean_length": 4732.578125,
+      "completions/mean_terminated_length": 4640.83447265625,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "entropy": 1.0815455242991447,
+      "epoch": 0.013799448022079117,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006024828180670738,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 11017781.0,
+      "reward": 0.25,
+      "reward_std": 0.2959064245223999,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999477863311768,
+      "sampling/importance_sampling_ratio/min": 0.00306904804892838,
+      "sampling/sampling_logp_difference/max": 5.786387920379639,
+      "sampling/sampling_logp_difference/mean": 0.020809629932045937,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 5.413130111264763e-06,
+      "clip_ratio/high_mean": 1.3532825278161908e-06,
+      "clip_ratio/low_mean": 2.816210690070875e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.951538942852494e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15673.0,
+      "completions/mean_length": 5931.4296875,
+      "completions/mean_terminated_length": 5849.1259765625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "entropy": 1.0069087892770767,
+      "epoch": 0.014719411223551058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036383175756782293,
+      "learning_rate": 1e-05,
+      "loss": -0.0001,
+      "num_tokens": 11794972.0,
+      "reward": 0.1875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999954342842102,
+      "sampling/importance_sampling_ratio/min": 0.00028886934160254896,
+      "sampling/sampling_logp_difference/max": 8.1495361328125,
+      "sampling/sampling_logp_difference/mean": 0.019794823601841927,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.421858264118782e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.421858264118782e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14139.0,
+      "completions/mean_length": 5473.6171875,
+      "completions/mean_terminated_length": 5387.70849609375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "entropy": 1.0765233263373375,
+      "epoch": 0.015639374425023,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004312732256948948,
+      "learning_rate": 1e-05,
+      "loss": 0.0478,
+      "num_tokens": 12517443.0,
+      "reward": 0.2578125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999938428401947,
+      "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07,
+      "sampling/sampling_logp_difference/max": 16.095191955566406,
+      "sampling/sampling_logp_difference/mean": 0.020093362778425217,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 2.0872469121968606e-05,
+      "clip_ratio/high_mean": 5.218117280492152e-06,
+      "clip_ratio/low_mean": 4.733878370188904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.255690120975487e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6617.7578125,
+      "completions/mean_terminated_length": 6137.45068359375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.8550976514816284,
+      "epoch": 0.01655933762649494,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021260723005980253,
+      "learning_rate": 1e-05,
+      "loss": 0.1382,
+      "num_tokens": 13384420.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999901294708252,
+      "sampling/importance_sampling_ratio/min": 0.02929881028831005,
+      "sampling/sampling_logp_difference/max": 3.5302083492279053,
+      "sampling/sampling_logp_difference/mean": 0.01808803342282772,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 6.404673058568733e-06,
+      "clip_ratio/high_mean": 1.6011682646421832e-06,
+      "clip_ratio/low_mean": 3.2195434073400975e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.379660131486162e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14382.0,
+      "completions/mean_length": 5285.7578125,
+      "completions/mean_terminated_length": 5109.595703125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.8321448192000389,
+      "epoch": 0.017479300827966882,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003563448553904891,
+      "learning_rate": 1e-05,
+      "loss": 0.0001,
+      "num_tokens": 14081197.0,
+      "reward": 0.375,
+      "reward_std": 0.31116873025894165,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998770952224731,
+      "sampling/importance_sampling_ratio/min": 0.000519682711455971,
+      "sampling/sampling_logp_difference/max": 7.562292098999023,
+      "sampling/sampling_logp_difference/mean": 0.017500173300504684,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.128390534991922e-05,
+      "clip_ratio/low_min": 1.2459845038392814e-05,
+      "clip_ratio/region_mean": 5.128390534991922e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13991.0,
+      "completions/max_terminated_length": 13991.0,
+      "completions/mean_length": 4918.1953125,
+      "completions/mean_terminated_length": 4918.1953125,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "entropy": 0.9329824000597,
+      "epoch": 0.01839926402943882,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0048850164748728275,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 14727798.0,
+      "reward": 0.359375,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999402165412903,
+      "sampling/importance_sampling_ratio/min": 0.00041761461761780083,
+      "sampling/sampling_logp_difference/max": 7.780951499938965,
+      "sampling/sampling_logp_difference/mean": 0.01855182647705078,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 1.0364761692471802e-05,
+      "clip_ratio/high_mean": 2.5911904231179506e-06,
+      "clip_ratio/low_mean": 3.091395433330035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350514430167095e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16202.0,
+      "completions/max_terminated_length": 16202.0,
+      "completions/mean_length": 5268.5234375,
+      "completions/mean_terminated_length": 5268.5234375,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 1.1676538437604904,
+      "epoch": 0.019319227230910764,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030562332831323147,
+      "learning_rate": 1e-05,
+      "loss": 0.0443,
+      "num_tokens": 15421937.0,
+      "reward": 0.28125,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000249147415161,
+      "sampling/importance_sampling_ratio/min": 0.0016021198825910687,
+      "sampling/sampling_logp_difference/max": 6.436427593231201,
+      "sampling/sampling_logp_difference/mean": 0.021109789609909058,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 1.6653621514706174e-05,
+      "clip_ratio/high_mean": 4.1634053786765435e-06,
+      "clip_ratio/low_mean": 3.064284169340681e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.480624718577019e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15940.0,
+      "completions/mean_length": 5361.0703125,
+      "completions/mean_terminated_length": 5186.103515625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "entropy": 0.9569757729768753,
+      "epoch": 0.020239190432382703,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003777366131544113,
+      "learning_rate": 1e-05,
+      "loss": 0.0058,
+      "num_tokens": 16128698.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26409637928009033,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999063611030579,
+      "sampling/importance_sampling_ratio/min": 0.004354433622211218,
+      "sampling/sampling_logp_difference/max": 5.43656063079834,
+      "sampling/sampling_logp_difference/mean": 0.01940997503697872,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 3.227977140340954e-05,
+      "clip_ratio/high_mean": 9.227950727108691e-06,
+      "clip_ratio/low_mean": 4.881033578385541e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8038286169903586e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15623.0,
+      "completions/mean_length": 7428.3125,
+      "completions/mean_terminated_length": 6910.21435546875,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 1.0387683138251305,
+      "epoch": 0.021159153633854646,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005749945063143969,
+      "learning_rate": 1e-05,
+      "loss": 0.0466,
+      "num_tokens": 17101202.0,
+      "reward": 0.2734375,
+      "reward_std": 0.33114415407180786,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.010671229101717472,
+      "sampling/sampling_logp_difference/max": 4.540204048156738,
+      "sampling/sampling_logp_difference/mean": 0.021208524703979492,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 2.544114977354184e-06,
+      "clip_ratio/high_mean": 6.36028744338546e-07,
+      "clip_ratio/low_mean": 4.543399086287536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6070018697719206e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15659.0,
+      "completions/mean_length": 5462.203125,
+      "completions/mean_terminated_length": 5288.841796875,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 1.088257022202015,
+      "epoch": 0.02207911683532659,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005364824552088976,
+      "learning_rate": 1e-05,
+      "loss": -0.0106,
+      "num_tokens": 17820796.0,
+      "reward": 0.1953125,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.1953125,
+      "rewards/accuracy_reward/std": 0.3979988098144531,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 0.0013558369828388095,
+      "sampling/sampling_logp_difference/max": 6.603336334228516,
+      "sampling/sampling_logp_difference/mean": 0.020104583352804184,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 1.132360557676293e-05,
+      "clip_ratio/high_mean": 2.8309013941907324e-06,
+      "clip_ratio/low_mean": 3.686837260374887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.96992739979396e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 5423.234375,
+      "completions/mean_terminated_length": 5249.25439453125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 0.9123491793870926,
+      "epoch": 0.022999080036798528,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002392752794548869,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 18538546.0,
+      "reward": 0.3125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999918341636658,
+      "sampling/importance_sampling_ratio/min": 1.657394705034676e-06,
+      "sampling/sampling_logp_difference/max": 13.310263633728027,
+      "sampling/sampling_logp_difference/mean": 0.02011517994105816,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 2.127026391463005e-05,
+      "clip_ratio/high_mean": 6.648429234701325e-06,
+      "clip_ratio/low_mean": 1.4927492088645522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.157592166440736e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13978.0,
+      "completions/mean_length": 5574.1640625,
+      "completions/mean_terminated_length": 5489.04736328125,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 1.0090710371732712,
+      "epoch": 0.02391904323827047,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036540210712701082,
+      "learning_rate": 1e-05,
+      "loss": 0.0036,
+      "num_tokens": 19270439.0,
+      "reward": 0.3515625,
+      "reward_std": 0.23646268248558044,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 0.00021918962011113763,
+      "sampling/sampling_logp_difference/max": 8.425573348999023,
+      "sampling/sampling_logp_difference/mean": 0.02006707340478897,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 1.1303152405162109e-05,
+      "clip_ratio/high_mean": 2.8257881012905273e-06,
+      "clip_ratio/low_mean": 2.827990363130084e-05,
+      "clip_ratio/low_min": 5.86744272368378e-06,
+      "clip_ratio/region_mean": 3.1105691391530854e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15061.0,
+      "completions/mean_length": 6204.75,
+      "completions/mean_terminated_length": 6124.5986328125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 0.957111045718193,
+      "epoch": 0.02483900643974241,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006005869247019291,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 20083655.0,
+      "reward": 0.3046875,
+      "reward_std": 0.31616854667663574,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999549388885498,
+      "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08,
+      "sampling/sampling_logp_difference/max": 18.249685287475586,
+      "sampling/sampling_logp_difference/mean": 0.0189923457801342,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 8.289213610623847e-06,
+      "clip_ratio/high_mean": 2.0723034026559617e-06,
+      "clip_ratio/low_mean": 3.4569659419503296e-05,
+      "clip_ratio/low_min": 3.6480373637459707e-06,
+      "clip_ratio/region_mean": 3.664196310637635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15274.0,
+      "completions/mean_length": 5659.5703125,
+      "completions/mean_terminated_length": 5489.341796875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "entropy": 0.9482033550739288,
+      "epoch": 0.025758969641214352,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004129618871957064,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 20829064.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3745690584182739,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999293088912964,
+      "sampling/importance_sampling_ratio/min": 4.007936149719171e-05,
+      "sampling/sampling_logp_difference/max": 10.124649047851562,
+      "sampling/sampling_logp_difference/mean": 0.019232336431741714,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 1.3534072877519066e-05,
+      "clip_ratio/high_mean": 3.3835182193797664e-06,
+      "clip_ratio/low_mean": 2.9090757720950933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.247427605401754e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14558.0,
+      "completions/mean_length": 5122.9609375,
+      "completions/mean_terminated_length": 5034.29150390625,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 1.020588956773281,
+      "epoch": 0.02667893284268629,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004814058542251587,
+      "learning_rate": 1e-05,
+      "loss": 0.0994,
+      "num_tokens": 21505483.0,
+      "reward": 0.3359375,
+      "reward_std": 0.31930169463157654,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.0003798597026616335,
+      "sampling/sampling_logp_difference/max": 7.87570858001709,
+      "sampling/sampling_logp_difference/mean": 0.019156761467456818,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 2.0903964468743652e-05,
+      "clip_ratio/high_mean": 5.225991117185913e-06,
+      "clip_ratio/low_mean": 4.13707307416189e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.659672185880481e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15926.0,
+      "completions/max_terminated_length": 15926.0,
+      "completions/mean_length": 4833.734375,
+      "completions/mean_terminated_length": 4833.734375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 1.0276868790388107,
+      "epoch": 0.027598896044158234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006946730427443981,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 22142657.0,
+      "reward": 0.421875,
+      "reward_std": 0.250127375125885,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998937845230103,
+      "sampling/importance_sampling_ratio/min": 0.000452048028819263,
+      "sampling/sampling_logp_difference/max": 7.701722145080566,
+      "sampling/sampling_logp_difference/mean": 0.019841451197862625,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 8.514986802765634e-06,
+      "clip_ratio/high_mean": 2.1287467006914085e-06,
+      "clip_ratio/low_mean": 3.9484380408794095e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.161312688211183e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15122.0,
+      "completions/mean_length": 6895.390625,
+      "completions/mean_terminated_length": 6589.30615234375,
+      "completions/min_length": 729.0,
+      "completions/min_terminated_length": 729.0,
+      "entropy": 1.1640124469995499,
+      "epoch": 0.028518859245630176,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0040768519975245,
+      "learning_rate": 1e-05,
+      "loss": 0.0397,
+      "num_tokens": 23045931.0,
+      "reward": 0.1484375,
+      "reward_std": 0.20175683498382568,
+      "rewards/accuracy_reward/mean": 0.1484375,
+      "rewards/accuracy_reward/std": 0.356930136680603,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603629112244,
+      "sampling/importance_sampling_ratio/min": 0.0006075318087823689,
+      "sampling/sampling_logp_difference/max": 7.406105995178223,
+      "sampling/sampling_logp_difference/mean": 0.02265278436243534,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.95245172057912e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95245172057912e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 6504.0625,
+      "completions/mean_terminated_length": 6347.23828125,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "entropy": 1.1040372923016548,
+      "epoch": 0.029438822447102116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004854958038777113,
+      "learning_rate": 1e-05,
+      "loss": 0.0853,
+      "num_tokens": 23899259.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2619747221469879,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999864935874939,
+      "sampling/importance_sampling_ratio/min": 3.380438373667971e-09,
+      "sampling/sampling_logp_difference/max": 19.505260467529297,
+      "sampling/sampling_logp_difference/mean": 0.020535167306661606,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 1.8890462797571672e-05,
+      "clip_ratio/high_mean": 4.722615699392918e-06,
+      "clip_ratio/low_mean": 4.095688700544997e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.567950259115605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14787.0,
+      "completions/mean_length": 5553.2578125,
+      "completions/mean_terminated_length": 5467.9765625,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 1.0357396975159645,
+      "epoch": 0.03035878564857406,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005473555997014046,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 24631956.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000377893447876,
+      "sampling/importance_sampling_ratio/min": 0.004898479674011469,
+      "sampling/sampling_logp_difference/max": 5.318830490112305,
+      "sampling/sampling_logp_difference/mean": 0.019490022212266922,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 8.08538152341498e-06,
+      "clip_ratio/high_mean": 2.021345380853745e-06,
+      "clip_ratio/low_mean": 2.4400278334724135e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6421623601891042e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15753.0,
+      "completions/max_terminated_length": 15753.0,
+      "completions/mean_length": 5357.46875,
+      "completions/mean_terminated_length": 5357.46875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0332984924316406,
+      "epoch": 0.031278748850046,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003410332603380084,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 25336544.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037384986877,
+      "sampling/importance_sampling_ratio/min": 0.00010891074634855613,
+      "sampling/sampling_logp_difference/max": 9.124981880187988,
+      "sampling/sampling_logp_difference/mean": 0.01885366439819336,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 1.2948894436703995e-05,
+      "clip_ratio/high_mean": 3.2372236091759987e-06,
+      "clip_ratio/low_mean": 3.931040214411041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.25476254122259e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7812.8984375,
+      "completions/mean_terminated_length": 7745.4091796875,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.031004011631012,
+      "epoch": 0.03219871205151794,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003122704103589058,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 26355691.0,
+      "reward": 0.2890625,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999194145202637,
+      "sampling/importance_sampling_ratio/min": 0.002222655341029167,
+      "sampling/sampling_logp_difference/max": 6.109052658081055,
+      "sampling/sampling_logp_difference/mean": 0.022181488573551178,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 1.3199577551858965e-05,
+      "clip_ratio/high_mean": 3.2998943879647413e-06,
+      "clip_ratio/low_mean": 3.742906312709238e-05,
+      "clip_ratio/low_min": 3.3127500955743017e-06,
+      "clip_ratio/region_mean": 4.072895751505712e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6402.6875,
+      "completions/mean_terminated_length": 5825.255859375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "entropy": 0.96993837505579,
+      "epoch": 0.03311867525298988,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003292364301159978,
+      "learning_rate": 1e-05,
+      "loss": 0.0211,
+      "num_tokens": 27193267.0,
+      "reward": 0.375,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000267028808594,
+      "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07,
+      "sampling/sampling_logp_difference/max": 14.72463607788086,
+      "sampling/sampling_logp_difference/mean": 0.019621271640062332,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 9.08137690203148e-06,
+      "clip_ratio/high_mean": 2.27034422550787e-06,
+      "clip_ratio/low_mean": 4.5394222524919314e-05,
+      "clip_ratio/low_min": 4.49300887339632e-06,
+      "clip_ratio/region_mean": 4.766456731886137e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 7525.40625,
+      "completions/mean_terminated_length": 7165.30078125,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.9819100275635719,
+      "epoch": 0.03403863845446182,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004192501772195101,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 28181183.0,
+      "reward": 0.3125,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999439716339111,
+      "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05,
+      "sampling/sampling_logp_difference/max": 11.227011680603027,
+      "sampling/sampling_logp_difference/mean": 0.019877666607499123,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.638440969349176e-05,
+      "clip_ratio/low_min": 6.698462129861582e-06,
+      "clip_ratio/region_mean": 2.638440969349176e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6930.8828125,
+      "completions/mean_terminated_length": 6625.943359375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9183463454246521,
+      "epoch": 0.034958601655933765,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029556062072515488,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 29087384.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2740417718887329,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
+      "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08,
+      "sampling/sampling_logp_difference/max": 16.87410545349121,
+      "sampling/sampling_logp_difference/mean": 0.0197360310703516,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 2.4210238279920304e-05,
+      "clip_ratio/high_mean": 6.052559569980076e-06,
+      "clip_ratio/low_mean": 3.344960384765727e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9502163645011024e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 5893.1796875,
+      "completions/mean_terminated_length": 5726.6591796875,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 1.05657509714365,
+      "epoch": 0.035878564857405704,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0044409241527318954,
+      "learning_rate": 1e-05,
+      "loss": 0.0042,
+      "num_tokens": 29860767.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3435155153274536,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999690055847168,
+      "sampling/importance_sampling_ratio/min": 2.243226049358782e-07,
+      "sampling/sampling_logp_difference/max": 15.3101806640625,
+      "sampling/sampling_logp_difference/mean": 0.02058839052915573,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.4493159887460934e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4493159887460934e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13471.0,
+      "completions/max_terminated_length": 13471.0,
+      "completions/mean_length": 5779.4765625,
+      "completions/mean_terminated_length": 5779.4765625,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.0302623957395554,
+      "epoch": 0.03679852805887764,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004552105907350779,
+      "learning_rate": 1e-05,
+      "loss": -0.0198,
+      "num_tokens": 30620388.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3295513987541199,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999610185623169,
+      "sampling/importance_sampling_ratio/min": 0.011830558069050312,
+      "sampling/sampling_logp_difference/max": 4.437069416046143,
+      "sampling/sampling_logp_difference/mean": 0.020457806065678596,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 9.270246664527804e-06,
+      "clip_ratio/high_mean": 2.317561666131951e-06,
+      "clip_ratio/low_mean": 3.615360617459373e-05,
+      "clip_ratio/low_min": 4.283315774955554e-06,
+      "clip_ratio/region_mean": 3.8471167840725684e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13927.0,
+      "completions/max_terminated_length": 13927.0,
+      "completions/mean_length": 5429.1328125,
+      "completions/mean_terminated_length": 5429.1328125,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9245247691869736,
+      "epoch": 0.03771849126034959,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003300054930150509,
+      "learning_rate": 1e-05,
+      "loss": 0.1138,
+      "num_tokens": 31334221.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23592591285705566,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999089241027832,
+      "sampling/importance_sampling_ratio/min": 0.00017977353127207607,
+      "sampling/sampling_logp_difference/max": 8.623812675476074,
+      "sampling/sampling_logp_difference/mean": 0.01882476732134819,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 1.5042513723528828e-05,
+      "clip_ratio/high_mean": 3.760628430882207e-06,
+      "clip_ratio/low_mean": 3.780993347390904e-05,
+      "clip_ratio/low_min": 3.7437480386870448e-06,
+      "clip_ratio/region_mean": 4.157056224585176e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14875.0,
+      "completions/mean_length": 6073.6328125,
+      "completions/mean_terminated_length": 5909.9765625,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "entropy": 1.0127769336104393,
+      "epoch": 0.03863845446182153,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004679495934396982,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 32134854.0,
+      "reward": 0.359375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999961793422699,
+      "sampling/importance_sampling_ratio/min": 0.0006151580018922687,
+      "sampling/sampling_logp_difference/max": 7.393631458282471,
+      "sampling/sampling_logp_difference/mean": 0.02106339856982231,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 1.8307343452761415e-05,
+      "clip_ratio/high_mean": 4.576835863190354e-06,
+      "clip_ratio/low_mean": 5.7316304378218774e-05,
+      "clip_ratio/low_min": 1.412125402566744e-05,
+      "clip_ratio/region_mean": 6.189314035509597e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15052.0,
+      "completions/max_terminated_length": 15052.0,
+      "completions/mean_length": 5773.015625,
+      "completions/mean_terminated_length": 5773.015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 1.0045431107282639,
+      "epoch": 0.03955841766329347,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00485749589279294,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 32897040.0,
+      "reward": 0.46875,
+      "reward_std": 0.3595343232154846,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000348091125488,
+      "sampling/importance_sampling_ratio/min": 4.862526111537591e-06,
+      "sampling/sampling_logp_difference/max": 12.233952522277832,
+      "sampling/sampling_logp_difference/mean": 0.01966444030404091,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 1.578610726937768e-05,
+      "clip_ratio/high_mean": 3.94652681734442e-06,
+      "clip_ratio/low_mean": 1.772546147549292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1671988179150503e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14572.0,
+      "completions/mean_length": 4731.3515625,
+      "completions/mean_terminated_length": 4639.5986328125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 1.0001292675733566,
+      "epoch": 0.040478380864765406,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004726089537143707,
+      "learning_rate": 1e-05,
+      "loss": 0.0589,
+      "num_tokens": 33522133.0,
+      "reward": 0.390625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999152421951294,
+      "sampling/importance_sampling_ratio/min": 0.0001548011932754889,
+      "sampling/sampling_logp_difference/max": 8.773368835449219,
+      "sampling/sampling_logp_difference/mean": 0.019276604056358337,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 7.944579010654707e-06,
+      "clip_ratio/high_mean": 1.9861447526636766e-06,
+      "clip_ratio/low_mean": 8.259907644969644e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.024605239763332e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15661.0,
+      "completions/mean_length": 6908.8984375,
+      "completions/mean_terminated_length": 6834.29150390625,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "entropy": 1.0723063945770264,
+      "epoch": 0.04139834406623735,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0011808272683992982,
+      "learning_rate": 1e-05,
+      "loss": 0.0202,
+      "num_tokens": 34429384.0,
+      "reward": 0.2421875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999626278877258,
+      "sampling/importance_sampling_ratio/min": 0.0007662919815629721,
+      "sampling/sampling_logp_difference/max": 7.173947334289551,
+      "sampling/sampling_logp_difference/mean": 0.021076666191220284,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 8.888357569958316e-06,
+      "clip_ratio/high_mean": 2.222089392489579e-06,
+      "clip_ratio/low_mean": 2.6357692036071967e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8579780860127357e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16348.0,
+      "completions/mean_length": 6679.140625,
+      "completions/mean_terminated_length": 6446.22412109375,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9413202852010727,
+      "epoch": 0.04231830726770929,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003600373398512602,
+      "learning_rate": 1e-05,
+      "loss": 0.0433,
+      "num_tokens": 35302474.0,
+      "reward": 0.3203125,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998806715011597,
+      "sampling/importance_sampling_ratio/min": 9.02900064829737e-05,
+      "sampling/sampling_logp_difference/max": 9.312483787536621,
+      "sampling/sampling_logp_difference/mean": 0.019808633252978325,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 9.364646757603623e-06,
+      "clip_ratio/high_mean": 2.3411616894009057e-06,
+      "clip_ratio/low_mean": 1.6833528775350715e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9174690351064783e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16253.0,
+      "completions/mean_length": 5954.5859375,
+      "completions/mean_terminated_length": 5872.46435546875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "entropy": 1.200403742492199,
+      "epoch": 0.04323827046918123,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003302425378933549,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 36093941.0,
+      "reward": 0.1640625,
+      "reward_std": 0.1990984082221985,
+      "rewards/accuracy_reward/mean": 0.1640625,
+      "rewards/accuracy_reward/std": 0.371787428855896,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998957514762878,
+      "sampling/importance_sampling_ratio/min": 0.0026806045789271593,
+      "sampling/sampling_logp_difference/max": 5.921712875366211,
+      "sampling/sampling_logp_difference/mean": 0.022528307512402534,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 1.2503618108894443e-05,
+      "clip_ratio/high_mean": 3.944288664570195e-06,
+      "clip_ratio/low_mean": 4.7836430894676596e-05,
+      "clip_ratio/low_min": 6.161485543998424e-06,
+      "clip_ratio/region_mean": 5.1780719331873115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15867.0,
+      "completions/mean_length": 6109.1953125,
+      "completions/mean_terminated_length": 5946.103515625,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9069097489118576,
+      "epoch": 0.04415823367065318,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005992463324218988,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 36893486.0,
+      "reward": 0.4921875,
+      "reward_std": 0.40373340249061584,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576807022095,
+      "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05,
+      "sampling/sampling_logp_difference/max": 10.624975204467773,
+      "sampling/sampling_logp_difference/mean": 0.018979201093316078,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 1.1075947440986056e-05,
+      "clip_ratio/high_mean": 2.768986860246514e-06,
+      "clip_ratio/low_mean": 2.73638818271138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.013286891473399e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15331.0,
+      "completions/mean_length": 6265.5390625,
+      "completions/mean_terminated_length": 6022.6962890625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.9107594564557076,
+      "epoch": 0.045078196872125116,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005304713733494282,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 37716027.0,
+      "reward": 0.484375,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 0.0003461121814325452,
+      "sampling/sampling_logp_difference/max": 7.968747615814209,
+      "sampling/sampling_logp_difference/mean": 0.019227473065257072,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 1.0917767667706357e-05,
+      "clip_ratio/high_mean": 3.674950448839809e-06,
+      "clip_ratio/low_mean": 3.135283236588293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.50277827010359e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15860.0,
+      "completions/mean_length": 6143.1796875,
+      "completions/mean_terminated_length": 5897.400390625,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "entropy": 0.9168931543827057,
+      "epoch": 0.045998160073597055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0017410843865945935,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 38519738.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2301519215106964,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998773336410522,
+      "sampling/importance_sampling_ratio/min": 0.0036513316445052624,
+      "sampling/sampling_logp_difference/max": 5.612663269042969,
+      "sampling/sampling_logp_difference/mean": 0.019512062892317772,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 5.4981305765977595e-06,
+      "clip_ratio/high_mean": 3.7445629459398333e-06,
+      "clip_ratio/low_mean": 2.6178069106208568e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.99226320521484e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15616.0,
+      "completions/mean_length": 7165.265625,
+      "completions/mean_terminated_length": 7092.67724609375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.9690218195319176,
+      "epoch": 0.046918123275068994,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004612576216459274,
+      "learning_rate": 1e-05,
+      "loss": 0.0544,
+      "num_tokens": 39461012.0,
+      "reward": 0.3125,
+      "reward_std": 0.35505855083465576,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000245571136475,
+      "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05,
+      "sampling/sampling_logp_difference/max": 9.999534606933594,
+      "sampling/sampling_logp_difference/mean": 0.0201116893440485,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 1.2953334362464375e-05,
+      "clip_ratio/high_mean": 3.2383335906160937e-06,
+      "clip_ratio/low_mean": 2.1866131419301382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5104465066760895e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16007.0,
+      "completions/max_terminated_length": 16007.0,
+      "completions/mean_length": 5617.9296875,
+      "completions/mean_terminated_length": 5617.9296875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "entropy": 1.0479632839560509,
+      "epoch": 0.04783808647654094,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003172830445691943,
+      "learning_rate": 1e-05,
+      "loss": -0.0235,
+      "num_tokens": 40202979.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999560117721558,
+      "sampling/importance_sampling_ratio/min": 1.229221084031451e-06,
+      "sampling/sampling_logp_difference/max": 13.609129905700684,
+      "sampling/sampling_logp_difference/mean": 0.020904643461108208,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 1.4129082956060302e-05,
+      "clip_ratio/high_mean": 4.841006557398941e-06,
+      "clip_ratio/low_mean": 4.556761541607557e-05,
+      "clip_ratio/low_min": 8.631802302261349e-06,
+      "clip_ratio/region_mean": 5.040862197347451e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 5995.3203125,
+      "completions/mean_terminated_length": 5913.51953125,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 1.022934041917324,
+      "epoch": 0.04875804967801288,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003333345288410783,
+      "learning_rate": 1e-05,
+      "loss": 0.0336,
+      "num_tokens": 40989532.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999282360076904,
+      "sampling/importance_sampling_ratio/min": 8.228168007917702e-05,
+      "sampling/sampling_logp_difference/max": 9.405362129211426,
+      "sampling/sampling_logp_difference/mean": 0.021745413541793823,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 2.286436574649997e-05,
+      "clip_ratio/high_mean": 6.531613848892448e-06,
+      "clip_ratio/low_mean": 3.960530659696815e-05,
+      "clip_ratio/low_min": 3.4269107800355414e-06,
+      "clip_ratio/region_mean": 4.6136920445860596e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15648.0,
+      "completions/mean_length": 6297.859375,
+      "completions/mean_terminated_length": 6055.79248046875,
+      "completions/min_length": 1243.0,
+      "completions/min_terminated_length": 1243.0,
+      "entropy": 0.9511058703064919,
+      "epoch": 0.04967801287948482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005538261961191893,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 41813914.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999532699584961,
+      "sampling/importance_sampling_ratio/min": 0.00017344337538816035,
+      "sampling/sampling_logp_difference/max": 8.659659385681152,
+      "sampling/sampling_logp_difference/mean": 0.019708994776010513,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 4.575737420964288e-06,
+      "clip_ratio/high_mean": 1.143934355241072e-06,
+      "clip_ratio/low_mean": 2.561447990956367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6758414151117904e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14051.0,
+      "completions/max_terminated_length": 14051.0,
+      "completions/mean_length": 4765.046875,
+      "completions/mean_terminated_length": 4765.046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9130316227674484,
+      "epoch": 0.050597976080956765,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024318129289895296,
+      "learning_rate": 1e-05,
+      "loss": -0.0177,
+      "num_tokens": 42443288.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196253418922424,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999518394470215,
+      "sampling/importance_sampling_ratio/min": 0.0001035423920257017,
+      "sampling/sampling_logp_difference/max": 9.175529479980469,
+      "sampling/sampling_logp_difference/mean": 0.01920286938548088,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 7.084159733494744e-06,
+      "clip_ratio/high_mean": 1.771039933373686e-06,
+      "clip_ratio/low_mean": 4.221943618176738e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3990476115141064e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15445.0,
+      "completions/mean_length": 6411.5,
+      "completions/mean_terminated_length": 5834.578125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.8110766112804413,
+      "epoch": 0.051517939282428704,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018957280553877354,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 43287600.0,
+      "reward": 0.3984375,
+      "reward_std": 0.1990983933210373,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212622642517,
+      "sampling/importance_sampling_ratio/min": 0.0021892013028264046,
+      "sampling/sampling_logp_difference/max": 6.124218463897705,
+      "sampling/sampling_logp_difference/mean": 0.018554572016000748,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 6.7589489844976924e-06,
+      "clip_ratio/high_mean": 1.6897372461244231e-06,
+      "clip_ratio/low_mean": 4.334260950145108e-05,
+      "clip_ratio/low_min": 8.570448699174449e-06,
+      "clip_ratio/region_mean": 4.503234697494918e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15508.0,
+      "completions/mean_length": 6552.40625,
+      "completions/mean_terminated_length": 6235.2578125,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 1.0034996420145035,
+      "epoch": 0.05243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002431448083370924,
+      "learning_rate": 1e-05,
+      "loss": 0.0274,
+      "num_tokens": 44145524.0,
+      "reward": 0.25,
+      "reward_std": 0.3114011883735657,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999876081943512,
+      "sampling/importance_sampling_ratio/min": 0.051090992987155914,
+      "sampling/sampling_logp_difference/max": 2.974147081375122,
+      "sampling/sampling_logp_difference/mean": 0.020403606817126274,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 2.1032463337178342e-05,
+      "clip_ratio/high_mean": 6.422987098630983e-06,
+      "clip_ratio/low_mean": 1.0045687076853937e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.646867417548492e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13257.0,
+      "completions/mean_length": 4688.7734375,
+      "completions/mean_terminated_length": 4408.08837890625,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.9620971381664276,
+      "epoch": 0.05335786568537258,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004493447951972485,
+      "learning_rate": 1e-05,
+      "loss": 0.0344,
+      "num_tokens": 44763895.0,
+      "reward": 0.53125,
+      "reward_std": 0.26196980476379395,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05,
+      "sampling/sampling_logp_difference/max": 10.364669799804688,
+      "sampling/sampling_logp_difference/mean": 0.01916680857539177,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 3.076594612139161e-05,
+      "clip_ratio/high_mean": 7.691486530347902e-06,
+      "clip_ratio/low_mean": 2.8500278403953416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.619176493430132e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 5381.1875,
+      "completions/mean_terminated_length": 5294.55126953125,
+      "completions/min_length": 25.0,
+      "completions/min_terminated_length": 25.0,
+      "entropy": 1.0265433564782143,
+      "epoch": 0.05427782888684453,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0049595762975513935,
+      "learning_rate": 1e-05,
+      "loss": 0.0813,
+      "num_tokens": 45470335.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998573660850525,
+      "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07,
+      "sampling/sampling_logp_difference/max": 15.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.020656142383813858,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 2.6326441002311185e-05,
+      "clip_ratio/high_mean": 6.581610250577796e-06,
+      "clip_ratio/low_mean": 3.143254116366734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8014151868992485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15006.0,
+      "completions/mean_length": 5613.84375,
+      "completions/mean_terminated_length": 5529.03955078125,
+      "completions/min_length": 55.0,
+      "completions/min_terminated_length": 55.0,
+      "entropy": 1.0289503335952759,
+      "epoch": 0.05519779208831647,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00655899103730917,
+      "learning_rate": 1e-05,
+      "loss": 0.068,
+      "num_tokens": 46206971.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999952495098114,
+      "sampling/importance_sampling_ratio/min": 0.03283476456999779,
+      "sampling/sampling_logp_difference/max": 3.4162673950195312,
+      "sampling/sampling_logp_difference/mean": 0.020495962351560593,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 3.233557390558417e-05,
+      "clip_ratio/high_mean": 8.083893476396042e-06,
+      "clip_ratio/low_mean": 3.3687326776998816e-05,
+      "clip_ratio/low_min": 5.745277576352237e-06,
+      "clip_ratio/region_mean": 4.1771219912334345e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14763.0,
+      "completions/mean_length": 5577.2890625,
+      "completions/mean_terminated_length": 5492.19677734375,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "entropy": 0.9836367890238762,
+      "epoch": 0.05611775528978841,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007459669373929501,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 46940112.0,
+      "reward": 0.4453125,
+      "reward_std": 0.39082521200180054,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000065565109253,
+      "sampling/importance_sampling_ratio/min": 8.196697649509588e-07,
+      "sampling/sampling_logp_difference/max": 14.014364242553711,
+      "sampling/sampling_logp_difference/mean": 0.018994126468896866,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 1.720242698866059e-05,
+      "clip_ratio/high_mean": 4.300606747165148e-06,
+      "clip_ratio/low_mean": 3.032099141364597e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.462159838818479e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16273.0,
+      "completions/mean_length": 6547.140625,
+      "completions/mean_terminated_length": 6311.05615234375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "entropy": 0.9028418883681297,
+      "epoch": 0.05703771849126035,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005132914055138826,
+      "learning_rate": 1e-05,
+      "loss": -0.0013,
+      "num_tokens": 47796514.0,
+      "reward": 0.46875,
+      "reward_std": 0.2751026153564453,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999970018863678,
+      "sampling/importance_sampling_ratio/min": 0.0005014563794247806,
+      "sampling/sampling_logp_difference/max": 7.597993850708008,
+      "sampling/sampling_logp_difference/mean": 0.02021491341292858,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 1.078213176697318e-05,
+      "clip_ratio/high_mean": 2.695532941743295e-06,
+      "clip_ratio/low_mean": 2.838153790207798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1077070843821275e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 7409.3125,
+      "completions/mean_terminated_length": 6811.00048828125,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8614663332700729,
+      "epoch": 0.05795768169273229,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034147046972066164,
+      "learning_rate": 1e-05,
+      "loss": 0.0196,
+      "num_tokens": 48765386.0,
+      "reward": 0.3125,
+      "reward_std": 0.27198708057403564,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998997449874878,
+      "sampling/importance_sampling_ratio/min": 4.202586751489434e-06,
+      "sampling/sampling_logp_difference/max": 12.379810333251953,
+      "sampling/sampling_logp_difference/mean": 0.01943383738398552,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 1.153353150584735e-05,
+      "clip_ratio/high_mean": 2.8833828764618374e-06,
+      "clip_ratio/low_mean": 3.695166174111364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.98350443902018e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14591.0,
+      "completions/mean_length": 6420.859375,
+      "completions/mean_terminated_length": 6181.744140625,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "entropy": 0.9671022593975067,
+      "epoch": 0.05887764489420423,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004101228900253773,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 49606280.0,
+      "reward": 0.34375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.000259009946603328,
+      "sampling/sampling_logp_difference/max": 8.258644104003906,
+      "sampling/sampling_logp_difference/mean": 0.01929381489753723,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.231768923546042e-05,
+      "clip_ratio/low_min": 5.164009053260088e-06,
+      "clip_ratio/region_mean": 4.231768923546042e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14971.0,
+      "completions/mean_length": 4852.7578125,
+      "completions/mean_terminated_length": 4761.96044921875,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "entropy": 0.9933939427137375,
+      "epoch": 0.05979760809567617,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0077895247377455235,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 50246457.0,
+      "reward": 0.40625,
+      "reward_std": 0.35400262475013733,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05,
+      "sampling/sampling_logp_difference/max": 10.624988555908203,
+      "sampling/sampling_logp_difference/mean": 0.01895500347018242,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.009997408298659e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.009997408298659e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15610.0,
+      "completions/max_terminated_length": 15610.0,
+      "completions/mean_length": 6840.03125,
+      "completions/mean_terminated_length": 6840.03125,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.9959733113646507,
+      "epoch": 0.06071757129714812,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00207411777228117,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 51141597.0,
+      "reward": 0.28125,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999240636825562,
+      "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07,
+      "sampling/sampling_logp_difference/max": 15.411253929138184,
+      "sampling/sampling_logp_difference/mean": 0.02091015875339508,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 2.297391938554938e-05,
+      "clip_ratio/high_mean": 6.853683203189576e-06,
+      "clip_ratio/low_mean": 4.6152885829542356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3006569942226633e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15249.0,
+      "completions/mean_length": 6567.3828125,
+      "completions/mean_terminated_length": 6331.7841796875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "entropy": 1.0921807065606117,
+      "epoch": 0.061637534498620056,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006496666464954615,
+      "learning_rate": 1e-05,
+      "loss": 0.0238,
+      "num_tokens": 52001758.0,
+      "reward": 0.296875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999523162841797,
+      "sampling/importance_sampling_ratio/min": 0.0026403397787362337,
+      "sampling/sampling_logp_difference/max": 5.936847686767578,
+      "sampling/sampling_logp_difference/mean": 0.021580250933766365,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 1.2290649465285242e-05,
+      "clip_ratio/high_mean": 3.0726623663213104e-06,
+      "clip_ratio/low_mean": 1.7558751551405294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0631413917726604e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16336.0,
+      "completions/mean_length": 6927.265625,
+      "completions/mean_terminated_length": 6542.84521484375,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8170016556978226,
+      "epoch": 0.062557497700092,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002272722776979208,
+      "learning_rate": 1e-05,
+      "loss": 0.021,
+      "num_tokens": 52907256.0,
+      "reward": 0.28125,
+      "reward_std": 0.22673700749874115,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999938011169434,
+      "sampling/importance_sampling_ratio/min": 6.70690099013882e-08,
+      "sampling/sampling_logp_difference/max": 16.51754379272461,
+      "sampling/sampling_logp_difference/mean": 0.01844738982617855,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 1.016177520796191e-05,
+      "clip_ratio/high_mean": 4.526967131823767e-06,
+      "clip_ratio/low_mean": 5.522496246612718e-05,
+      "clip_ratio/low_min": 4.129910394112812e-06,
+      "clip_ratio/region_mean": 5.9751928688456246e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16228.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 5889.28125,
+      "completions/mean_terminated_length": 5889.28125,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 1.0794919431209564,
+      "epoch": 0.06347746090156393,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005651532672345638,
+      "learning_rate": 1e-05,
+      "loss": 0.0382,
+      "num_tokens": 53682100.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32613158226013184,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 4.226289718189946e-07,
+      "sampling/sampling_logp_difference/max": 14.67677116394043,
+      "sampling/sampling_logp_difference/mean": 0.020069826394319534,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 5.796966888738098e-06,
+      "clip_ratio/high_mean": 1.4492417221845244e-06,
+      "clip_ratio/low_mean": 4.575056436806335e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.719980597656104e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 5909.3125,
+      "completions/mean_terminated_length": 5394.16357421875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "entropy": 0.8462172821164131,
+      "epoch": 0.06439742410303588,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002985857194289565,
+      "learning_rate": 1e-05,
+      "loss": 0.0246,
+      "num_tokens": 54456508.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 0.000453252432635054,
+      "sampling/sampling_logp_difference/max": 7.699061393737793,
+      "sampling/sampling_logp_difference/mean": 0.01927822455763817,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 1.8927265045931563e-05,
+      "clip_ratio/high_mean": 5.821615673085034e-06,
+      "clip_ratio/low_mean": 3.1553636290482245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.737525207725412e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15579.0,
+      "completions/mean_length": 7465.3984375,
+      "completions/mean_terminated_length": 7177.701171875,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.8792542889714241,
+      "epoch": 0.06531738730450783,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036494233645498753,
+      "learning_rate": 1e-05,
+      "loss": 0.0218,
+      "num_tokens": 55429663.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998987913131714,
+      "sampling/importance_sampling_ratio/min": 0.0017587440088391304,
+      "sampling/sampling_logp_difference/max": 6.343155384063721,
+      "sampling/sampling_logp_difference/mean": 0.01909823715686798,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 8.78609989740653e-06,
+      "clip_ratio/high_mean": 2.1965249743516324e-06,
+      "clip_ratio/low_mean": 3.611839565564878e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.831492040262674e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 5674.9609375,
+      "completions/mean_terminated_length": 5590.6376953125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "entropy": 0.9117730036377907,
+      "epoch": 0.06623735050597976,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003951186314225197,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 56173314.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.003206930123269558,
+      "sampling/sampling_logp_difference/max": 5.742441177368164,
+      "sampling/sampling_logp_difference/mean": 0.01932360976934433,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 1.7587798083695816e-05,
+      "clip_ratio/high_mean": 5.872955512131739e-06,
+      "clip_ratio/low_mean": 4.657158876852918e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.244454393960041e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16325.0,
+      "completions/max_terminated_length": 16325.0,
+      "completions/mean_length": 4754.5390625,
+      "completions/mean_terminated_length": 4754.5390625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.8350499644875526,
+      "epoch": 0.0671573137074517,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.005329386796802282,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 56799911.0,
+      "reward": 0.515625,
+      "reward_std": 0.4111049771308899,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337196350098,
+      "sampling/importance_sampling_ratio/min": 8.575750689487904e-05,
+      "sampling/sampling_logp_difference/max": 9.36398696899414,
+      "sampling/sampling_logp_difference/mean": 0.01792578026652336,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 1.2575374057632871e-05,
+      "clip_ratio/high_mean": 3.1438435144082177e-06,
+      "clip_ratio/low_mean": 1.8536085917730816e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1679929204765358e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16316.0,
+      "completions/mean_length": 5744.2734375,
+      "completions/mean_terminated_length": 5488.92041015625,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8065197095274925,
+      "epoch": 0.06807727690892364,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036763548851013184,
+      "learning_rate": 1e-05,
+      "loss": 0.082,
+      "num_tokens": 57553986.0,
+      "reward": 0.515625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999231696128845,
+      "sampling/importance_sampling_ratio/min": 0.00011362064105924219,
+      "sampling/sampling_logp_difference/max": 9.082645416259766,
+      "sampling/sampling_logp_difference/mean": 0.018098725005984306,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 1.877081149359583e-05,
+      "clip_ratio/high_mean": 6.101248914092139e-06,
+      "clip_ratio/low_mean": 2.6290458890798618e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239170769120392e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16088.0,
+      "completions/mean_length": 6754.5234375,
+      "completions/mean_terminated_length": 6523.41650390625,
+      "completions/min_length": 638.0,
+      "completions/min_terminated_length": 638.0,
+      "entropy": 1.013127624988556,
+      "epoch": 0.06899724011039558,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038320303428918123,
+      "learning_rate": 1e-05,
+      "loss": 0.0139,
+      "num_tokens": 58438333.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2369818389415741,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999582767486572,
+      "sampling/importance_sampling_ratio/min": 2.284922175022075e-06,
+      "sampling/sampling_logp_difference/max": 12.989178657531738,
+      "sampling/sampling_logp_difference/mean": 0.02173798717558384,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 1.9026635982299922e-05,
+      "clip_ratio/high_mean": 6.682960474790889e-06,
+      "clip_ratio/low_mean": 3.252214798976638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.920510800980992e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 12421.0,
+      "completions/mean_length": 6203.5390625,
+      "completions/mean_terminated_length": 6123.3779296875,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 1.0302691981196404,
+      "epoch": 0.06991720331186753,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004985450301319361,
+      "learning_rate": 1e-05,
+      "loss": 0.0483,
+      "num_tokens": 59249562.0,
+      "reward": 0.421875,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999778270721436,
+      "sampling/importance_sampling_ratio/min": 0.004553908482193947,
+      "sampling/sampling_logp_difference/max": 5.3917694091796875,
+      "sampling/sampling_logp_difference/mean": 0.019999932497739792,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 1.3964342088002013e-05,
+      "clip_ratio/high_mean": 3.4910855220005033e-06,
+      "clip_ratio/low_mean": 3.63567767180939e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.984786212640756e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 5727.796875,
+      "completions/mean_terminated_length": 5643.8896484375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.9781062752008438,
+      "epoch": 0.07083716651333946,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0037541294004768133,
+      "learning_rate": 1e-05,
+      "loss": 0.0236,
+      "num_tokens": 60001208.0,
+      "reward": 0.3828125,
+      "reward_std": 0.20753079652786255,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.00022466933296527714,
+      "sampling/sampling_logp_difference/max": 8.400880813598633,
+      "sampling/sampling_logp_difference/mean": 0.020555900409817696,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 2.7796927497547586e-06,
+      "clip_ratio/high_mean": 6.949231874386896e-07,
+      "clip_ratio/low_mean": 3.516969627526123e-05,
+      "clip_ratio/low_min": 4.025116595585132e-06,
+      "clip_ratio/region_mean": 3.586461934901308e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 5923.8359375,
+      "completions/mean_terminated_length": 5409.4013671875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "entropy": 0.9449758678674698,
+      "epoch": 0.07175712971481141,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.007178841158747673,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 60777899.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2977364659309387,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999773502349854,
+      "sampling/importance_sampling_ratio/min": 0.0004897661856375635,
+      "sampling/sampling_logp_difference/max": 7.621582508087158,
+      "sampling/sampling_logp_difference/mean": 0.019868161529302597,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 2.7642782697512303e-05,
+      "clip_ratio/high_mean": 9.016423746288638e-06,
+      "clip_ratio/low_mean": 4.3257180891487224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.227360486514954e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14267.0,
+      "completions/mean_length": 6171.640625,
+      "completions/mean_terminated_length": 5926.54443359375,
+      "completions/min_length": 721.0,
+      "completions/min_terminated_length": 721.0,
+      "entropy": 0.8597526922821999,
+      "epoch": 0.07267709291628335,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004776299465447664,
+      "learning_rate": 1e-05,
+      "loss": 0.0136,
+      "num_tokens": 61587141.0,
+      "reward": 0.46875,
+      "reward_std": 0.36113685369491577,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945878982544,
+      "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05,
+      "sampling/sampling_logp_difference/max": 10.124996185302734,
+      "sampling/sampling_logp_difference/mean": 0.019484341144561768,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 4.145968978264136e-05,
+      "clip_ratio/high_mean": 1.036492244566034e-05,
+      "clip_ratio/low_mean": 3.6077020070024446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.644194200409402e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15310.0,
+      "completions/mean_length": 5501.59375,
+      "completions/mean_terminated_length": 5415.9052734375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.9705724790692329,
+      "epoch": 0.07359705611775529,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007431659381836653,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "num_tokens": 62308321.0,
+      "reward": 0.453125,
+      "reward_std": 0.400318443775177,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000393390655518,
+      "sampling/importance_sampling_ratio/min": 4.54318942502141e-05,
+      "sampling/sampling_logp_difference/max": 9.999296188354492,
+      "sampling/sampling_logp_difference/mean": 0.019636545330286026,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 4.327206170273712e-06,
+      "clip_ratio/high_mean": 1.081801542568428e-06,
+      "clip_ratio/low_mean": 6.429905033655814e-05,
+      "clip_ratio/low_min": 6.3626184783061035e-06,
+      "clip_ratio/region_mean": 6.538085153806605e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15781.0,
+      "completions/mean_length": 5908.125,
+      "completions/mean_terminated_length": 5825.6376953125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8575867265462875,
+      "epoch": 0.07451701931922723,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005465450696647167,
+      "learning_rate": 1e-05,
+      "loss": 0.0797,
+      "num_tokens": 63084113.0,
+      "reward": 0.34375,
+      "reward_std": 0.39400771260261536,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999576210975647,
+      "sampling/importance_sampling_ratio/min": 4.766937126987614e-05,
+      "sampling/sampling_logp_difference/max": 9.951221466064453,
+      "sampling/sampling_logp_difference/mean": 0.018073562532663345,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 6.7512828536564484e-06,
+      "clip_ratio/high_mean": 1.6878207134141121e-06,
+      "clip_ratio/low_mean": 3.040744320514932e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.209526391856343e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15981.0,
+      "completions/max_terminated_length": 15981.0,
+      "completions/mean_length": 4906.734375,
+      "completions/mean_terminated_length": 4906.734375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "entropy": 0.9647495672106743,
+      "epoch": 0.07543698252069918,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003077819012105465,
+      "learning_rate": 1e-05,
+      "loss": -0.0104,
+      "num_tokens": 63740015.0,
+      "reward": 0.4375,
+      "reward_std": 0.2251344621181488,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000091791152954,
+      "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05,
+      "sampling/sampling_logp_difference/max": 9.879111289978027,
+      "sampling/sampling_logp_difference/mean": 0.01949312724173069,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 7.262375220307149e-06,
+      "clip_ratio/high_mean": 1.8155938050767872e-06,
+      "clip_ratio/low_mean": 3.626802561029763e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8083618960627064e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15716.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 5402.78125,
+      "completions/mean_terminated_length": 5402.78125,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "entropy": 0.9809223562479019,
+      "epoch": 0.07635694572217111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0018245981773361564,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 64450515.0,
+      "reward": 0.265625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999257922172546,
+      "sampling/importance_sampling_ratio/min": 0.0009712215978652239,
+      "sampling/sampling_logp_difference/max": 6.93695592880249,
+      "sampling/sampling_logp_difference/mean": 0.019615523517131805,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 1.1774221320592915e-05,
+      "clip_ratio/high_mean": 2.9435553301482287e-06,
+      "clip_ratio/low_mean": 4.734331901090627e-05,
+      "clip_ratio/low_min": 1.1585900665522786e-05,
+      "clip_ratio/region_mean": 5.0286874625271594e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16329.0,
+      "completions/mean_length": 6198.703125,
+      "completions/mean_terminated_length": 5870.14501953125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.8571672514081001,
+      "epoch": 0.07727690892364306,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006053395569324493,
+      "learning_rate": 1e-05,
+      "loss": 0.0645,
+      "num_tokens": 65269285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3464113473892212,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.0010333011159673333,
+      "sampling/sampling_logp_difference/max": 6.874996662139893,
+      "sampling/sampling_logp_difference/mean": 0.01869945600628853,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 2.7282983865006827e-05,
+      "clip_ratio/high_mean": 7.78695198278001e-06,
+      "clip_ratio/low_mean": 3.2358174394175876e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0145126376955886e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6016.09375,
+      "completions/mean_terminated_length": 5851.52392578125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.9883866459131241,
+      "epoch": 0.078196872125115,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030623299535363913,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 66058473.0,
+      "reward": 0.3203125,
+      "reward_std": 0.24883407354354858,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999305009841919,
+      "sampling/importance_sampling_ratio/min": 0.0016286972677335143,
+      "sampling/sampling_logp_difference/max": 6.4199748039245605,
+      "sampling/sampling_logp_difference/mean": 0.02085939608514309,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 2.9797377010254422e-06,
+      "clip_ratio/high_mean": 7.449344252563606e-07,
+      "clip_ratio/low_mean": 3.9277208315979806e-05,
+      "clip_ratio/low_min": 4.51475443696836e-06,
+      "clip_ratio/region_mean": 4.002214268439275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 7072.53125,
+      "completions/mean_terminated_length": 6924.73046875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.0157204791903496,
+      "epoch": 0.07911683532658693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038264680188149214,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 66984285.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999492764472961,
+      "sampling/importance_sampling_ratio/min": 0.0020860559307038784,
+      "sampling/sampling_logp_difference/max": 6.17248010635376,
+      "sampling/sampling_logp_difference/mean": 0.021116644144058228,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 6.0717920860042796e-06,
+      "clip_ratio/high_mean": 1.5179480215010699e-06,
+      "clip_ratio/low_mean": 3.757404465432046e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.909199278950837e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6539.8203125,
+      "completions/mean_terminated_length": 6303.56005859375,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 1.0071343630552292,
+      "epoch": 0.08003679852805888,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0044838739559054375,
+      "learning_rate": 1e-05,
+      "loss": 0.0583,
+      "num_tokens": 67840310.0,
+      "reward": 0.390625,
+      "reward_std": 0.2722293734550476,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999875426292419,
+      "sampling/importance_sampling_ratio/min": 0.001703627873212099,
+      "sampling/sampling_logp_difference/max": 6.374995231628418,
+      "sampling/sampling_logp_difference/mean": 0.020990263670682907,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 2.859010169231624e-05,
+      "clip_ratio/high_mean": 7.14752542307906e-06,
+      "clip_ratio/low_mean": 3.50394579982094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.218698381919239e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16224.0,
+      "completions/mean_length": 7204.09375,
+      "completions/mean_terminated_length": 6907.9677734375,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 0.9221752807497978,
+      "epoch": 0.08095676172953081,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034659637603908777,
+      "learning_rate": 1e-05,
+      "loss": -0.0057,
+      "num_tokens": 68782042.0,
+      "reward": 0.4140625,
+      "reward_std": 0.27958327531814575,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324083328247,
+      "sampling/importance_sampling_ratio/min": 0.0003347320598550141,
+      "sampling/sampling_logp_difference/max": 8.002180099487305,
+      "sampling/sampling_logp_difference/mean": 0.02053149789571762,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 1.7420219137420645e-05,
+      "clip_ratio/high_mean": 4.355054784355161e-06,
+      "clip_ratio/low_mean": 2.086669928758056e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.522175350350153e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14764.0,
+      "completions/mean_length": 5662.1640625,
+      "completions/mean_terminated_length": 5577.740234375,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.9678512960672379,
+      "epoch": 0.08187672493100276,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024458845146000385,
+      "learning_rate": 1e-05,
+      "loss": 0.0584,
+      "num_tokens": 69526295.0,
+      "reward": 0.4375,
+      "reward_std": 0.18543373048305511,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999278783798218,
+      "sampling/importance_sampling_ratio/min": 0.0033961546141654253,
+      "sampling/sampling_logp_difference/max": 5.6851115226745605,
+      "sampling/sampling_logp_difference/mean": 0.018346723169088364,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 8.09375796961831e-06,
+      "clip_ratio/high_mean": 2.0234394924045773e-06,
+      "clip_ratio/low_mean": 1.8629728629093734e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0653167894124635e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16219.0,
+      "completions/mean_length": 5590.71875,
+      "completions/mean_terminated_length": 5505.732421875,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.9286820441484451,
+      "epoch": 0.0827966881324747,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004733253736048937,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "num_tokens": 70262771.0,
+      "reward": 0.4609375,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999312162399292,
+      "sampling/importance_sampling_ratio/min": 1.233097464137245e-05,
+      "sampling/sampling_logp_difference/max": 11.303396224975586,
+      "sampling/sampling_logp_difference/mean": 0.019460031762719154,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 1.8371122678217944e-05,
+      "clip_ratio/high_mean": 4.592780669554486e-06,
+      "clip_ratio/low_mean": 2.489819087259093e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.949097142845858e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6182.484375,
+      "completions/mean_terminated_length": 6102.1572265625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 1.0872880518436432,
+      "epoch": 0.08371665133394664,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00468763243407011,
+      "learning_rate": 1e-05,
+      "loss": 0.0223,
+      "num_tokens": 71079953.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26933354139328003,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 9.611623681848869e-05,
+      "sampling/sampling_logp_difference/max": 9.24995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02069907821714878,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 1.579416039021453e-05,
+      "clip_ratio/high_mean": 4.633066396309005e-06,
+      "clip_ratio/low_mean": 2.6412633246764017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1045699415699346e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 5909.2265625,
+      "completions/mean_terminated_length": 5826.748046875,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.9488153457641602,
+      "epoch": 0.08463661453541858,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034273737110197544,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 71856574.0,
+      "reward": 0.4140625,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998411536216736,
+      "sampling/importance_sampling_ratio/min": 0.00016871529805939645,
+      "sampling/sampling_logp_difference/max": 8.687297821044922,
+      "sampling/sampling_logp_difference/mean": 0.019539739936590195,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 3.7449817682499997e-06,
+      "clip_ratio/high_mean": 9.362454420624999e-07,
+      "clip_ratio/low_mean": 4.2946558664880286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.388280387956911e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15933.0,
+      "completions/max_terminated_length": 15933.0,
+      "completions/mean_length": 6381.3125,
+      "completions/mean_terminated_length": 6381.3125,
+      "completions/min_length": 58.0,
+      "completions/min_terminated_length": 58.0,
+      "entropy": 0.9708949401974678,
+      "epoch": 0.08555657773689053,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003149663796648383,
+      "learning_rate": 1e-05,
+      "loss": 0.0314,
+      "num_tokens": 72696806.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.00033631984842941165,
+      "sampling/sampling_logp_difference/max": 7.997447967529297,
+      "sampling/sampling_logp_difference/mean": 0.021038895472884178,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 6.492157353932271e-06,
+      "clip_ratio/high_mean": 1.6230393384830677e-06,
+      "clip_ratio/low_mean": 4.956343445883249e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.118647413837607e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16180.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 5726.03125,
+      "completions/mean_terminated_length": 5726.03125,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.9100239053368568,
+      "epoch": 0.08647654093836246,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029015145264565945,
+      "learning_rate": 1e-05,
+      "loss": 0.0355,
+      "num_tokens": 73449210.0,
+      "reward": 0.3125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999687671661377,
+      "sampling/importance_sampling_ratio/min": 0.0001686852192506194,
+      "sampling/sampling_logp_difference/max": 8.68747615814209,
+      "sampling/sampling_logp_difference/mean": 0.020026210695505142,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 1.199616144731408e-05,
+      "clip_ratio/high_mean": 2.99904036182852e-06,
+      "clip_ratio/low_mean": 1.4287397789303213e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7286438151131733e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15624.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 5824.90625,
+      "completions/mean_terminated_length": 5824.90625,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9244210943579674,
+      "epoch": 0.08739650413983441,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0045582144521176815,
+      "learning_rate": 1e-05,
+      "loss": 0.0387,
+      "num_tokens": 74212662.0,
+      "reward": 0.4375,
+      "reward_std": 0.24777324497699738,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000125169754028,
+      "sampling/importance_sampling_ratio/min": 0.0021414682269096375,
+      "sampling/sampling_logp_difference/max": 6.146263599395752,
+      "sampling/sampling_logp_difference/mean": 0.019039880484342575,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 1.010842470350326e-05,
+      "clip_ratio/high_mean": 2.527106175875815e-06,
+      "clip_ratio/low_mean": 4.0637585470904014e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.316469153309299e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 6432.7265625,
+      "completions/mean_terminated_length": 6274.77001953125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "entropy": 0.8756264597177505,
+      "epoch": 0.08831646734130635,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0040692174807190895,
+      "learning_rate": 1e-05,
+      "loss": 0.0776,
+      "num_tokens": 75054003.0,
+      "reward": 0.4609375,
+      "reward_std": 0.35506343841552734,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998587369918823,
+      "sampling/importance_sampling_ratio/min": 0.005546991713345051,
+      "sampling/sampling_logp_difference/max": 5.194499492645264,
+      "sampling/sampling_logp_difference/mean": 0.019711513072252274,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.6582903135240485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6582903135240485e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14589.0,
+      "completions/mean_length": 5474.6796875,
+      "completions/mean_terminated_length": 5388.779296875,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9279408678412437,
+      "epoch": 0.08923643054277829,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035478502977639437,
+      "learning_rate": 1e-05,
+      "loss": 0.1137,
+      "num_tokens": 75773194.0,
+      "reward": 0.546875,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000132322311401,
+      "sampling/importance_sampling_ratio/min": 0.004276251420378685,
+      "sampling/sampling_logp_difference/max": 5.454678535461426,
+      "sampling/sampling_logp_difference/mean": 0.018789665773510933,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 8.227548732975265e-06,
+      "clip_ratio/high_mean": 2.0568871832438163e-06,
+      "clip_ratio/low_mean": 4.1461861655989196e-05,
+      "clip_ratio/low_min": 3.5008122267754516e-06,
+      "clip_ratio/region_mean": 4.351874804342515e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 6730.2734375,
+      "completions/mean_terminated_length": 6577.0400390625,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "entropy": 1.0115349367260933,
+      "epoch": 0.09015639374425023,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004816337022930384,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 76654837.0,
+      "reward": 0.40625,
+      "reward_std": 0.35325103998184204,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000003576278687,
+      "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06,
+      "sampling/sampling_logp_difference/max": 13.464577674865723,
+      "sampling/sampling_logp_difference/mean": 0.021000642329454422,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 2.0833075723203365e-05,
+      "clip_ratio/high_mean": 5.208268930800841e-06,
+      "clip_ratio/low_mean": 2.399133984454238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.919960945746425e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14350.0,
+      "completions/mean_length": 4804.5859375,
+      "completions/mean_terminated_length": 4620.7861328125,
+      "completions/min_length": 54.0,
+      "completions/min_terminated_length": 54.0,
+      "entropy": 0.8622925356030464,
+      "epoch": 0.09107635694572216,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00695947976782918,
+      "learning_rate": 1e-05,
+      "loss": -0.0188,
+      "num_tokens": 77287704.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000001072883606,
+      "sampling/importance_sampling_ratio/min": 0.051502522081136703,
+      "sampling/sampling_logp_difference/max": 2.9661245346069336,
+      "sampling/sampling_logp_difference/mean": 0.019261913374066353,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 1.2886264812550507e-05,
+      "clip_ratio/high_mean": 3.221566203137627e-06,
+      "clip_ratio/low_mean": 3.53349669239833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8556532899747253e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15997.0,
+      "completions/mean_length": 5836.25,
+      "completions/mean_terminated_length": 5753.19677734375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.8808795213699341,
+      "epoch": 0.09199632014719411,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034830078948289156,
+      "learning_rate": 1e-05,
+      "loss": 0.1412,
+      "num_tokens": 78054048.0,
+      "reward": 0.484375,
+      "reward_std": 0.29036492109298706,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999365210533142,
+      "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06,
+      "sampling/sampling_logp_difference/max": 12.792928695678711,
+      "sampling/sampling_logp_difference/mean": 0.01845550537109375,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 2.630969447636744e-05,
+      "clip_ratio/high_mean": 6.57742361909186e-06,
+      "clip_ratio/low_mean": 3.4728200375866436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1305623994958296e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13964.0,
+      "completions/mean_length": 5407.5703125,
+      "completions/mean_terminated_length": 5233.341796875,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.9438152015209198,
+      "epoch": 0.09291628334866606,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028610217850655317,
+      "learning_rate": 1e-05,
+      "loss": -0.0024,
+      "num_tokens": 78765225.0,
+      "reward": 0.390625,
+      "reward_std": 0.26037710905075073,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999171495437622,
+      "sampling/importance_sampling_ratio/min": 5.874436828889884e-05,
+      "sampling/sampling_logp_difference/max": 9.742315292358398,
+      "sampling/sampling_logp_difference/mean": 0.018839653581380844,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 1.2485550996643724e-05,
+      "clip_ratio/high_mean": 3.917444360013178e-06,
+      "clip_ratio/low_mean": 3.569766681721376e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961511060879275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15780.0,
+      "completions/mean_length": 6591.765625,
+      "completions/mean_terminated_length": 6436.33349609375,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.9185260459780693,
+      "epoch": 0.09383624655013799,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004098972305655479,
+      "learning_rate": 1e-05,
+      "loss": 0.0626,
+      "num_tokens": 79628691.0,
+      "reward": 0.40625,
+      "reward_std": 0.26932865381240845,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.001684795250184834,
+      "sampling/sampling_logp_difference/max": 6.386111259460449,
+      "sampling/sampling_logp_difference/mean": 0.02011241763830185,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 1.017276917991694e-05,
+      "clip_ratio/high_mean": 2.543192294979235e-06,
+      "clip_ratio/low_mean": 2.3897301389297354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.644049368427659e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16014.0,
+      "completions/mean_length": 6762.40625,
+      "completions/mean_terminated_length": 6371.2841796875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "entropy": 1.0496173724532127,
+      "epoch": 0.09475620975160993,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003109709592536092,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 80513135.0,
+      "reward": 0.296875,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999967098236084,
+      "sampling/importance_sampling_ratio/min": 0.0036795397754758596,
+      "sampling/sampling_logp_difference/max": 5.6049675941467285,
+      "sampling/sampling_logp_difference/mean": 0.021886618807911873,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 1.0623295338518801e-05,
+      "clip_ratio/high_mean": 2.6558238346297003e-06,
+      "clip_ratio/low_mean": 5.0279177912671e-05,
+      "clip_ratio/low_min": 6.849113788121031e-06,
+      "clip_ratio/region_mean": 5.29350020315178e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15653.0,
+      "completions/mean_length": 8151.421875,
+      "completions/mean_terminated_length": 7528.79052734375,
+      "completions/min_length": 1052.0,
+      "completions/min_terminated_length": 1052.0,
+      "entropy": 0.8989155367016792,
+      "epoch": 0.09567617295308188,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050065224058926105,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 81579941.0,
+      "reward": 0.375,
+      "reward_std": 0.36691081523895264,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999760389328003,
+      "sampling/importance_sampling_ratio/min": 0.0007560441154055297,
+      "sampling/sampling_logp_difference/max": 7.187410831451416,
+      "sampling/sampling_logp_difference/mean": 0.02017449401319027,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 7.662745701964013e-06,
+      "clip_ratio/high_mean": 1.9156864254910033e-06,
+      "clip_ratio/low_mean": 4.2927287609018094e-05,
+      "clip_ratio/low_min": 4.201963292871369e-06,
+      "clip_ratio/region_mean": 4.484297357976175e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16094.0,
+      "completions/mean_length": 6871.7265625,
+      "completions/mean_terminated_length": 6643.43212890625,
+      "completions/min_length": 1044.0,
+      "completions/min_terminated_length": 1044.0,
+      "entropy": 1.006680078804493,
+      "epoch": 0.09659613615455381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00352756236679852,
+      "learning_rate": 1e-05,
+      "loss": 0.0927,
+      "num_tokens": 82479474.0,
+      "reward": 0.3984375,
+      "reward_std": 0.33296146988868713,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000028133392334,
+      "sampling/importance_sampling_ratio/min": 0.023528963327407837,
+      "sampling/sampling_logp_difference/max": 3.749523162841797,
+      "sampling/sampling_logp_difference/mean": 0.021244853734970093,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 1.6621729173493804e-05,
+      "clip_ratio/high_mean": 5.544197733797773e-06,
+      "clip_ratio/low_mean": 2.3860119426899473e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9404316592263058e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14462.0,
+      "completions/max_terminated_length": 14462.0,
+      "completions/mean_length": 5705.6015625,
+      "completions/mean_terminated_length": 5705.6015625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.9162084609270096,
+      "epoch": 0.09751609935602576,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002348776441067457,
+      "learning_rate": 1e-05,
+      "loss": -0.0169,
+      "num_tokens": 83229071.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29249149560928345,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 1.176890145870857e-05,
+      "sampling/sampling_logp_difference/max": 11.35004997253418,
+      "sampling/sampling_logp_difference/mean": 0.01885361783206463,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.359476631383586e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.359476631383586e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 6823.90625,
+      "completions/mean_terminated_length": 6823.90625,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "entropy": 1.0139815732836723,
+      "epoch": 0.0984360625574977,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005057404283434153,
+      "learning_rate": 1e-05,
+      "loss": 0.038,
+      "num_tokens": 84119947.0,
+      "reward": 0.328125,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 0.014701711013913155,
+      "sampling/sampling_logp_difference/max": 4.219791412353516,
+      "sampling/sampling_logp_difference/mean": 0.021600374951958656,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 1.642513325350592e-05,
+      "clip_ratio/high_mean": 4.10628331337648e-06,
+      "clip_ratio/low_mean": 3.813199691649061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2238279775119736e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15516.0,
+      "completions/max_terminated_length": 15516.0,
+      "completions/mean_length": 5786.859375,
+      "completions/mean_terminated_length": 5786.859375,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 1.0515320897102356,
+      "epoch": 0.09935602575896964,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.008517255075275898,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 84879833.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3311441242694855,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.00010231334454147145,
+      "sampling/sampling_logp_difference/max": 9.187470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01993538998067379,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 7.0043706728029065e-06,
+      "clip_ratio/high_mean": 1.7510926682007266e-06,
+      "clip_ratio/low_mean": 1.4313530300569255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.606462308245682e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15836.0,
+      "completions/mean_length": 4726.2578125,
+      "completions/mean_terminated_length": 4634.46435546875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.795353539288044,
+      "epoch": 0.10027598896044158,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034334585070610046,
+      "learning_rate": 1e-05,
+      "loss": 0.0214,
+      "num_tokens": 85503162.0,
+      "reward": 0.6015625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.6015625,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000437498092651,
+      "sampling/importance_sampling_ratio/min": 0.0026589478366076946,
+      "sampling/sampling_logp_difference/max": 5.9298248291015625,
+      "sampling/sampling_logp_difference/mean": 0.018191032111644745,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 4.149239885009592e-06,
+      "clip_ratio/high_mean": 1.037309971252398e-06,
+      "clip_ratio/low_mean": 3.989860044839588e-05,
+      "clip_ratio/low_min": 4.927079316985328e-06,
+      "clip_ratio/region_mean": 4.093591041964828e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6450.140625,
+      "completions/mean_terminated_length": 5787.8837890625,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.8920315206050873,
+      "epoch": 0.10119595216191353,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006242698058485985,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 86350364.0,
+      "reward": 0.359375,
+      "reward_std": 0.27540695667266846,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999819993972778,
+      "sampling/importance_sampling_ratio/min": 0.00015162504860199988,
+      "sampling/sampling_logp_difference/max": 8.794099807739258,
+      "sampling/sampling_logp_difference/mean": 0.01948007568717003,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 4.065173015987966e-06,
+      "clip_ratio/high_mean": 1.8426849237584975e-06,
+      "clip_ratio/low_mean": 2.8560575628944207e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0403260552702704e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15253.0,
+      "completions/mean_length": 6597.9453125,
+      "completions/mean_terminated_length": 6442.611328125,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "entropy": 0.9351271465420723,
+      "epoch": 0.10211591536338546,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002594202058389783,
+      "learning_rate": 1e-05,
+      "loss": 0.018,
+      "num_tokens": 87213277.0,
+      "reward": 0.34375,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998740553855896,
+      "sampling/importance_sampling_ratio/min": 0.007402713876217604,
+      "sampling/sampling_logp_difference/max": 4.905908584594727,
+      "sampling/sampling_logp_difference/mean": 0.02082553133368492,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 3.7528520806517918e-06,
+      "clip_ratio/high_mean": 9.382130201629479e-07,
+      "clip_ratio/low_mean": 4.297400278119312e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.391221568766923e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15955.0,
+      "completions/mean_length": 7109.9140625,
+      "completions/mean_terminated_length": 7036.8896484375,
+      "completions/min_length": 881.0,
+      "completions/min_terminated_length": 881.0,
+      "entropy": 0.8797949478030205,
+      "epoch": 0.10303587856485741,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002718541072681546,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 88144530.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26485776901245117,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999973714351654,
+      "sampling/importance_sampling_ratio/min": 0.0002329955023014918,
+      "sampling/sampling_logp_difference/max": 8.36449146270752,
+      "sampling/sampling_logp_difference/mean": 0.01960277371108532,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 1.1254821401962545e-05,
+      "clip_ratio/high_mean": 2.813705350490636e-06,
+      "clip_ratio/low_mean": 4.423825043886609e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7051955789356725e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15834.0,
+      "completions/mean_length": 7361.6796875,
+      "completions/mean_terminated_length": 6513.427734375,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.9020541086792946,
+      "epoch": 0.10395584176632934,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003097688313573599,
+      "learning_rate": 1e-05,
+      "loss": 0.0854,
+      "num_tokens": 89109897.0,
+      "reward": 0.359375,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998917579650879,
+      "sampling/importance_sampling_ratio/min": 0.0010758653515949845,
+      "sampling/sampling_logp_difference/max": 6.834630012512207,
+      "sampling/sampling_logp_difference/mean": 0.01997425965964794,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 1.8235970401292434e-05,
+      "clip_ratio/high_mean": 5.248351158115838e-06,
+      "clip_ratio/low_mean": 7.228819413285237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.753654563202872e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 5957.28125,
+      "completions/mean_terminated_length": 5620.935546875,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.8262394368648529,
+      "epoch": 0.10487580496780129,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0023438548669219017,
+      "learning_rate": 1e-05,
+      "loss": 0.0869,
+      "num_tokens": 89891429.0,
+      "reward": 0.421875,
+      "reward_std": 0.3713865876197815,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998185634613037,
+      "sampling/importance_sampling_ratio/min": 8.2406731962692e-06,
+      "sampling/sampling_logp_difference/max": 11.706428527832031,
+      "sampling/sampling_logp_difference/mean": 0.018976174294948578,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 1.6280149793601595e-05,
+      "clip_ratio/high_mean": 5.4644419833493885e-06,
+      "clip_ratio/low_mean": 5.1420432782833814e-05,
+      "clip_ratio/low_min": 6.1973228184797335e-06,
+      "clip_ratio/region_mean": 5.688487522093055e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 5405.53125,
+      "completions/mean_terminated_length": 5142.04833984375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "entropy": 0.9246686547994614,
+      "epoch": 0.10579576816927323,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005619170609861612,
+      "learning_rate": 1e-05,
+      "loss": 0.0411,
+      "num_tokens": 90600721.0,
+      "reward": 0.421875,
+      "reward_std": 0.40821409225463867,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999438524246216,
+      "sampling/importance_sampling_ratio/min": 7.91921266340978e-08,
+      "sampling/sampling_logp_difference/max": 16.351388931274414,
+      "sampling/sampling_logp_difference/mean": 0.01931554079055786,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 9.228460612575873e-06,
+      "clip_ratio/high_mean": 2.307115153143968e-06,
+      "clip_ratio/low_mean": 3.463903834699522e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.694615350013919e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6754.859375,
+      "completions/mean_terminated_length": 6363.4306640625,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.952000230550766,
+      "epoch": 0.10671573137074516,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006043895613402128,
+      "learning_rate": 1e-05,
+      "loss": 0.0379,
+      "num_tokens": 91486063.0,
+      "reward": 0.3125,
+      "reward_std": 0.2527858018875122,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999067783355713,
+      "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06,
+      "sampling/sampling_logp_difference/max": 12.875,
+      "sampling/sampling_logp_difference/mean": 0.02107170596718788,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 1.460792736907024e-05,
+      "clip_ratio/high_mean": 3.65198184226756e-06,
+      "clip_ratio/low_mean": 3.14642731495951e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.511625499186266e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16377.0,
+      "completions/mean_length": 8135.8203125,
+      "completions/mean_terminated_length": 7869.75,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 1.0832853615283966,
+      "epoch": 0.10763569457221711,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00392121123149991,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 92546920.0,
+      "reward": 0.28125,
+      "reward_std": 0.2977413833141327,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874830245972,
+      "sampling/importance_sampling_ratio/min": 3.757069134735502e-05,
+      "sampling/sampling_logp_difference/max": 10.189286231994629,
+      "sampling/sampling_logp_difference/mean": 0.02211480587720871,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 3.585687591112219e-05,
+      "clip_ratio/high_mean": 8.964218977780547e-06,
+      "clip_ratio/low_mean": 3.652223790595599e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.548645733848389e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15497.0,
+      "completions/mean_length": 5476.53125,
+      "completions/mean_terminated_length": 5214.75244140625,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 1.0261689275503159,
+      "epoch": 0.10855565777368906,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00480870483443141,
+      "learning_rate": 1e-05,
+      "loss": 0.0378,
+      "num_tokens": 93270524.0,
+      "reward": 0.46875,
+      "reward_std": 0.3243093490600586,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000068187713623,
+      "sampling/importance_sampling_ratio/min": 0.02749602682888508,
+      "sampling/sampling_logp_difference/max": 3.5937137603759766,
+      "sampling/sampling_logp_difference/mean": 0.01990744285285473,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 5.126732958160574e-06,
+      "clip_ratio/high_mean": 1.2816832395401434e-06,
+      "clip_ratio/low_mean": 3.6732255466631614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8013938819858595e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7670.0625,
+      "completions/mean_terminated_length": 7165.9501953125,
+      "completions/min_length": 964.0,
+      "completions/min_terminated_length": 964.0,
+      "entropy": 0.8719229996204376,
+      "epoch": 0.10947562097516099,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003411791054531932,
+      "learning_rate": 1e-05,
+      "loss": 0.0792,
+      "num_tokens": 94271404.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999955415725708,
+      "sampling/importance_sampling_ratio/min": 2.125909531969228e-06,
+      "sampling/sampling_logp_difference/max": 13.061310768127441,
+      "sampling/sampling_logp_difference/mean": 0.01960139349102974,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 3.2011115308705484e-05,
+      "clip_ratio/high_mean": 1.0189622685174982e-05,
+      "clip_ratio/low_mean": 3.3884271260831156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4073893604945624e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15284.0,
+      "completions/mean_length": 5499.0859375,
+      "completions/mean_terminated_length": 5413.3779296875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "entropy": 0.8891193494200706,
+      "epoch": 0.11039558417663294,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0036615384742617607,
+      "learning_rate": 1e-05,
+      "loss": 0.0852,
+      "num_tokens": 94998263.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27776598930358887,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999586939811707,
+      "sampling/importance_sampling_ratio/min": 0.00029556488152593374,
+      "sampling/sampling_logp_difference/max": 8.126622200012207,
+      "sampling/sampling_logp_difference/mean": 0.01831059902906418,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 1.0020951322076144e-05,
+      "clip_ratio/high_mean": 2.505237830519036e-06,
+      "clip_ratio/low_mean": 3.4662164466681133e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.716740218351333e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15605.0,
+      "completions/mean_length": 7831.1015625,
+      "completions/mean_terminated_length": 7410.466796875,
+      "completions/min_length": 855.0,
+      "completions/min_terminated_length": 855.0,
+      "entropy": 0.9511109218001366,
+      "epoch": 0.11131554737810488,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003688640194013715,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 96020572.0,
+      "reward": 0.34375,
+      "reward_std": 0.266974538564682,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
+      "sampling/importance_sampling_ratio/min": 0.0008284422219730914,
+      "sampling/sampling_logp_difference/max": 7.095963478088379,
+      "sampling/sampling_logp_difference/mean": 0.020766064524650574,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 4.31883336204919e-06,
+      "clip_ratio/high_mean": 1.0797083405122976e-06,
+      "clip_ratio/low_mean": 4.2512260733929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.359196918812813e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16267.0,
+      "completions/mean_length": 7928.5,
+      "completions/mean_terminated_length": 7584.7802734375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 1.053833745419979,
+      "epoch": 0.11223551057957681,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002527788048610091,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 97055892.0,
+      "reward": 0.2734375,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999517560005188,
+      "sampling/importance_sampling_ratio/min": 8.097423233266454e-06,
+      "sampling/sampling_logp_difference/max": 11.72396469116211,
+      "sampling/sampling_logp_difference/mean": 0.02571871504187584,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 2.1440137970785145e-05,
+      "clip_ratio/high_mean": 5.360034492696286e-06,
+      "clip_ratio/low_mean": 5.3688914704252966e-05,
+      "clip_ratio/low_min": 1.0726187383625074e-05,
+      "clip_ratio/region_mean": 5.904894931063609e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15282.0,
+      "completions/mean_length": 7433.0,
+      "completions/mean_terminated_length": 7218.17626953125,
+      "completions/min_length": 1112.0,
+      "completions/min_terminated_length": 1112.0,
+      "entropy": 1.0001763850450516,
+      "epoch": 0.11315547378104876,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004057250916957855,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 98026604.0,
+      "reward": 0.3046875,
+      "reward_std": 0.30274903774261475,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719858169556,
+      "sampling/importance_sampling_ratio/min": 0.0026400478091090918,
+      "sampling/sampling_logp_difference/max": 5.936958312988281,
+      "sampling/sampling_logp_difference/mean": 0.020892417058348656,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 7.200895424830378e-06,
+      "clip_ratio/high_mean": 1.8002238562075945e-06,
+      "clip_ratio/low_mean": 3.0267089357494115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.206731355476222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15435.0,
+      "completions/mean_length": 6529.8046875,
+      "completions/mean_terminated_length": 6211.92724609375,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.0204281583428383,
+      "epoch": 0.1140754369825207,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004367270041257143,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 98882667.0,
+      "reward": 0.421875,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 0.00013984176621306688,
+      "sampling/sampling_logp_difference/max": 8.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.020555414259433746,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 3.583304760468309e-06,
+      "clip_ratio/high_mean": 8.958261901170772e-07,
+      "clip_ratio/low_mean": 3.819216192368913e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.908798782958911e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 7329.9140625,
+      "completions/mean_terminated_length": 6806.12353515625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.8461082950234413,
+      "epoch": 0.11499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0014496444491669536,
+      "learning_rate": 1e-05,
+      "loss": 0.027,
+      "num_tokens": 99847384.0,
+      "reward": 0.375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999719262123108,
+      "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05,
+      "sampling/sampling_logp_difference/max": 10.749985694885254,
+      "sampling/sampling_logp_difference/mean": 0.019216356799006462,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 1.0716735232563224e-05,
+      "clip_ratio/high_mean": 2.679183808140806e-06,
+      "clip_ratio/low_mean": 3.4717084645308205e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7396268680822686e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15200.0,
+      "completions/mean_length": 6518.4765625,
+      "completions/mean_terminated_length": 6200.23388671875,
+      "completions/min_length": 969.0,
+      "completions/min_terminated_length": 969.0,
+      "entropy": 0.880072832107544,
+      "epoch": 0.11591536338546458,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006009541917592287,
+      "learning_rate": 1e-05,
+      "loss": 0.0475,
+      "num_tokens": 100699437.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701976776123,
+      "sampling/importance_sampling_ratio/min": 6.729899905622005e-05,
+      "sampling/sampling_logp_difference/max": 9.606365203857422,
+      "sampling/sampling_logp_difference/mean": 0.01985173299908638,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 7.563064400528674e-06,
+      "clip_ratio/high_mean": 1.8907661001321685e-06,
+      "clip_ratio/low_mean": 3.8401355027417594e-05,
+      "clip_ratio/low_min": 3.4494178180466406e-06,
+      "clip_ratio/region_mean": 4.029212129808002e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 8421.9296875,
+      "completions/mean_terminated_length": 8030.35205078125,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.929582305252552,
+      "epoch": 0.11683532658693652,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00254544778726995,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 101797124.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999698400497437,
+      "sampling/importance_sampling_ratio/min": 2.139152456948068e-05,
+      "sampling/sampling_logp_difference/max": 10.75251579284668,
+      "sampling/sampling_logp_difference/mean": 0.020804740488529205,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 8.503243861923693e-06,
+      "clip_ratio/high_mean": 2.125810965480923e-06,
+      "clip_ratio/low_mean": 3.5734614471039094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7860425095459505e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14871.0,
+      "completions/mean_length": 6452.5859375,
+      "completions/mean_terminated_length": 6214.232421875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9258717745542526,
+      "epoch": 0.11775528978840846,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030309113208204508,
+      "learning_rate": 1e-05,
+      "loss": -0.0048,
+      "num_tokens": 102643751.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05,
+      "sampling/sampling_logp_difference/max": 10.81167221069336,
+      "sampling/sampling_logp_difference/mean": 0.02046305686235428,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 1.4670421251139487e-05,
+      "clip_ratio/high_mean": 4.865382209118252e-06,
+      "clip_ratio/low_mean": 2.8848363626821083e-05,
+      "clip_ratio/low_min": 3.2798930078570265e-06,
+      "clip_ratio/region_mean": 3.371374566540908e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 7655.140625,
+      "completions/mean_terminated_length": 7373.564453125,
+      "completions/min_length": 1095.0,
+      "completions/min_terminated_length": 1095.0,
+      "entropy": 1.1112212240695953,
+      "epoch": 0.11867525298988041,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0028038588352501392,
+      "learning_rate": 1e-05,
+      "loss": 0.0525,
+      "num_tokens": 103645849.0,
+      "reward": 0.390625,
+      "reward_std": 0.24435339868068695,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999130964279175,
+      "sampling/importance_sampling_ratio/min": 0.022794192656874657,
+      "sampling/sampling_logp_difference/max": 3.781249523162842,
+      "sampling/sampling_logp_difference/mean": 0.022147968411445618,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.8828401809732895e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8828401809732895e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 8166.765625,
+      "completions/mean_terminated_length": 7618.9501953125,
+      "completions/min_length": 838.0,
+      "completions/min_terminated_length": 838.0,
+      "entropy": 0.8589507639408112,
+      "epoch": 0.11959521619135234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003990175202488899,
+      "learning_rate": 1e-05,
+      "loss": 0.0942,
+      "num_tokens": 104712987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2680353820323944,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999572038650513,
+      "sampling/importance_sampling_ratio/min": 2.430162021482829e-05,
+      "sampling/sampling_logp_difference/max": 10.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.019254228100180626,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 7.719492032265407e-06,
+      "clip_ratio/high_mean": 1.9298730080663518e-06,
+      "clip_ratio/low_mean": 3.547307028384239e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7402943462439e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15444.0,
+      "completions/mean_length": 5872.40625,
+      "completions/mean_terminated_length": 5789.6376953125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 1.0606305003166199,
+      "epoch": 0.12051517939282429,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0038855294696986675,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 105481743.0,
+      "reward": 0.375,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999517202377319,
+      "sampling/importance_sampling_ratio/min": 0.0018136304570361972,
+      "sampling/sampling_logp_difference/max": 6.312424659729004,
+      "sampling/sampling_logp_difference/mean": 0.021132031455636024,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 1.6221786609094124e-05,
+      "clip_ratio/high_mean": 5.614050223812228e-06,
+      "clip_ratio/low_mean": 4.114894863960217e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6762998408667045e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15742.0,
+      "completions/mean_length": 6474.9375,
+      "completions/mean_terminated_length": 6237.1201171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "entropy": 0.8699874132871628,
+      "epoch": 0.12143514259429623,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004712321795523167,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 106333695.0,
+      "reward": 0.53125,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 4.115129559068009e-05,
+      "sampling/sampling_logp_difference/max": 10.098255157470703,
+      "sampling/sampling_logp_difference/mean": 0.019161570817232132,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 1.2752746897604084e-05,
+      "clip_ratio/high_mean": 3.188186724401021e-06,
+      "clip_ratio/low_mean": 2.881602637216929e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.200421309657031e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 6910.03125,
+      "completions/mean_terminated_length": 6604.4189453125,
+      "completions/min_length": 1212.0,
+      "completions/min_terminated_length": 1212.0,
+      "entropy": 0.8597542196512222,
+      "epoch": 0.12235510579576817,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031310587655752897,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 107236363.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000447034835815,
+      "sampling/importance_sampling_ratio/min": 0.0012788315070793033,
+      "sampling/sampling_logp_difference/max": 6.661808490753174,
+      "sampling/sampling_logp_difference/mean": 0.019823957234621048,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 1.2087368986613e-05,
+      "clip_ratio/high_mean": 3.02184224665325e-06,
+      "clip_ratio/low_mean": 3.179941927555774e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.482126135168073e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15311.0,
+      "completions/mean_length": 6172.7421875,
+      "completions/mean_terminated_length": 5843.3466796875,
+      "completions/min_length": 691.0,
+      "completions/min_terminated_length": 691.0,
+      "entropy": 0.9560965895652771,
+      "epoch": 0.12327506899724011,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006233204621821642,
+      "learning_rate": 1e-05,
+      "loss": -0.0101,
+      "num_tokens": 108044714.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2845958471298218,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743700027466,
+      "sampling/importance_sampling_ratio/min": 0.0012860872084274888,
+      "sampling/sampling_logp_difference/max": 6.656150817871094,
+      "sampling/sampling_logp_difference/mean": 0.020428352057933807,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 1.846628038038034e-05,
+      "clip_ratio/high_mean": 4.616570095095085e-06,
+      "clip_ratio/low_mean": 3.8776780229454744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.339335077929718e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15737.0,
+      "completions/mean_length": 6232.4609375,
+      "completions/mean_terminated_length": 5988.82421875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.792289063334465,
+      "epoch": 0.12419503219871206,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005762661807239056,
+      "learning_rate": 1e-05,
+      "loss": 0.1106,
+      "num_tokens": 108862901.0,
+      "reward": 0.53125,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 0.004497833084315062,
+      "sampling/sampling_logp_difference/max": 5.4041595458984375,
+      "sampling/sampling_logp_difference/mean": 0.01772497221827507,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 1.1131890460092109e-05,
+      "clip_ratio/high_mean": 2.782972615023027e-06,
+      "clip_ratio/low_mean": 3.377504378931917e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.65580164043422e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15440.0,
+      "completions/mean_length": 5181.1015625,
+      "completions/mean_terminated_length": 5003.27783203125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.7691714614629745,
+      "epoch": 0.125114995400184,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002916123950853944,
+      "learning_rate": 1e-05,
+      "loss": 0.088,
+      "num_tokens": 109544058.0,
+      "reward": 0.5625,
+      "reward_std": 0.3327339291572571,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07,
+      "sampling/sampling_logp_difference/max": 14.749001502990723,
+      "sampling/sampling_logp_difference/mean": 0.017177307978272438,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 2.2183079636306502e-05,
+      "clip_ratio/high_mean": 5.5457699090766255e-06,
+      "clip_ratio/low_mean": 3.033036318811355e-05,
+      "clip_ratio/low_min": 3.5457974263408687e-06,
+      "clip_ratio/region_mean": 3.587613309719018e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15288.0,
+      "completions/mean_length": 5583.5859375,
+      "completions/mean_terminated_length": 5235.185546875,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.922084204852581,
+      "epoch": 0.12603495860165592,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035183338914066553,
+      "learning_rate": 1e-05,
+      "loss": 0.0303,
+      "num_tokens": 110282853.0,
+      "reward": 0.484375,
+      "reward_std": 0.24381661415100098,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999736547470093,
+      "sampling/importance_sampling_ratio/min": 0.0021202145144343376,
+      "sampling/sampling_logp_difference/max": 6.156238079071045,
+      "sampling/sampling_logp_difference/mean": 0.01895858161151409,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 2.7135570235259365e-06,
+      "clip_ratio/high_mean": 6.783892558814841e-07,
+      "clip_ratio/low_mean": 2.520359919344628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.588198810826725e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16255.0,
+      "completions/mean_length": 7191.71875,
+      "completions/mean_terminated_length": 6659.93359375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.8676051273941994,
+      "epoch": 0.12695492180312787,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002201368333771825,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 111228449.0,
+      "reward": 0.296875,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998859167098999,
+      "sampling/importance_sampling_ratio/min": 0.0047781821340322495,
+      "sampling/sampling_logp_difference/max": 5.343695163726807,
+      "sampling/sampling_logp_difference/mean": 0.01915489323437214,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 2.2828588043921627e-05,
+      "clip_ratio/high_mean": 7.982446049936698e-06,
+      "clip_ratio/low_mean": 4.164742210832628e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962986872669717e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 5139.5859375,
+      "completions/mean_terminated_length": 4869.72021484375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.7077975794672966,
+      "epoch": 0.12787488500459981,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00720562506467104,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 111904700.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3566659688949585,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 9.015951036417391e-06,
+      "sampling/sampling_logp_difference/max": 11.616515159606934,
+      "sampling/sampling_logp_difference/mean": 0.016763046383857727,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 1.3030461104790447e-05,
+      "clip_ratio/high_mean": 3.257615276197612e-06,
+      "clip_ratio/low_mean": 5.0197708333143964e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.345532326828106e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15887.0,
+      "completions/mean_length": 7408.296875,
+      "completions/mean_terminated_length": 7118.7578125,
+      "completions/min_length": 678.0,
+      "completions/min_terminated_length": 678.0,
+      "entropy": 0.8338208198547363,
+      "epoch": 0.12879484820607176,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005594039335846901,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 112873218.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2806568741798401,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697804450989,
+      "sampling/importance_sampling_ratio/min": 5.832135502714664e-05,
+      "sampling/sampling_logp_difference/max": 9.749542236328125,
+      "sampling/sampling_logp_difference/mean": 0.018874341621994972,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 2.6347406674176455e-06,
+      "clip_ratio/high_mean": 6.586851668544114e-07,
+      "clip_ratio/low_mean": 3.066379792926455e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132248309611896e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16158.0,
+      "completions/mean_length": 7637.25,
+      "completions/mean_terminated_length": 7131.2392578125,
+      "completions/min_length": 943.0,
+      "completions/min_terminated_length": 943.0,
+      "entropy": 0.9943022206425667,
+      "epoch": 0.1297148114075437,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025491444393992424,
+      "learning_rate": 1e-05,
+      "loss": 0.089,
+      "num_tokens": 113869418.0,
+      "reward": 0.3046875,
+      "reward_std": 0.32641828060150146,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999139308929443,
+      "sampling/importance_sampling_ratio/min": 6.724766876686772e-07,
+      "sampling/sampling_logp_difference/max": 14.212298393249512,
+      "sampling/sampling_logp_difference/mean": 0.020018339157104492,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 1.7491673133918084e-05,
+      "clip_ratio/high_mean": 4.372918283479521e-06,
+      "clip_ratio/low_mean": 2.370427267806008e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8077190734165924e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 6144.8671875,
+      "completions/mean_terminated_length": 6064.244140625,
+      "completions/min_length": 1000.0,
+      "completions/min_terminated_length": 1000.0,
+      "entropy": 0.9252935722470284,
+      "epoch": 0.13063477460901565,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003907687962055206,
+      "learning_rate": 1e-05,
+      "loss": 0.1115,
+      "num_tokens": 114674257.0,
+      "reward": 0.5078125,
+      "reward_std": 0.287486732006073,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485611915588,
+      "sampling/importance_sampling_ratio/min": 0.003434742335230112,
+      "sampling/sampling_logp_difference/max": 5.673813343048096,
+      "sampling/sampling_logp_difference/mean": 0.018300339579582214,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 8.272644663520623e-06,
+      "clip_ratio/high_mean": 2.0681611658801557e-06,
+      "clip_ratio/low_mean": 2.688816772433711e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8956328833373846e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15768.0,
+      "completions/mean_length": 6266.6484375,
+      "completions/mean_terminated_length": 6186.984375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 1.0926234126091003,
+      "epoch": 0.13155473781048757,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0042014638893306255,
+      "learning_rate": 1e-05,
+      "loss": 0.023,
+      "num_tokens": 115496300.0,
+      "reward": 0.3671875,
+      "reward_std": 0.18884867429733276,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999801516532898,
+      "sampling/importance_sampling_ratio/min": 3.502686922729481e-06,
+      "sampling/sampling_logp_difference/max": 12.561980247497559,
+      "sampling/sampling_logp_difference/mean": 0.021998615935444832,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 2.7441840302344644e-05,
+      "clip_ratio/high_mean": 6.860460075586161e-06,
+      "clip_ratio/low_mean": 4.51459295618406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.200638997848728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15504.0,
+      "completions/mean_length": 6392.890625,
+      "completions/mean_terminated_length": 6234.3017578125,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "entropy": 0.9028401970863342,
+      "epoch": 0.13247470101195952,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028757627587765455,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 116333286.0,
+      "reward": 0.4453125,
+      "reward_std": 0.35665616393089294,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000064373016357,
+      "sampling/importance_sampling_ratio/min": 2.327528392243039e-07,
+      "sampling/sampling_logp_difference/max": 15.27328872680664,
+      "sampling/sampling_logp_difference/mean": 0.019069479778409004,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 1.216986993313185e-05,
+      "clip_ratio/high_mean": 3.0424674832829623e-06,
+      "clip_ratio/low_mean": 3.626850991622632e-05,
+      "clip_ratio/low_min": 4.492201696848497e-06,
+      "clip_ratio/region_mean": 3.931097762688296e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16183.0,
+      "completions/mean_length": 6300.1640625,
+      "completions/mean_terminated_length": 6220.763671875,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "entropy": 1.110174722969532,
+      "epoch": 0.13339466421343146,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006776242982596159,
+      "learning_rate": 1e-05,
+      "loss": 0.0858,
+      "num_tokens": 117158619.0,
+      "reward": 0.3125,
+      "reward_std": 0.29826053977012634,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998651742935181,
+      "sampling/importance_sampling_ratio/min": 6.477496299339691e-07,
+      "sampling/sampling_logp_difference/max": 14.249761581420898,
+      "sampling/sampling_logp_difference/mean": 0.022119753062725067,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 7.707248187216464e-06,
+      "clip_ratio/high_mean": 1.926812046804116e-06,
+      "clip_ratio/low_mean": 1.452984838579141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.6456660432595527e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7482.25,
+      "completions/mean_terminated_length": 7340.95263671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "entropy": 0.9957183450460434,
+      "epoch": 0.1343146274149034,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003441061358898878,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 118140579.0,
+      "reward": 0.2109375,
+      "reward_std": 0.23250605165958405,
+      "rewards/accuracy_reward/mean": 0.2109375,
+      "rewards/accuracy_reward/std": 0.4095771610736847,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999172687530518,
+      "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05,
+      "sampling/sampling_logp_difference/max": 11.356839179992676,
+      "sampling/sampling_logp_difference/mean": 0.020916422829031944,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 1.3650881555804517e-05,
+      "clip_ratio/high_mean": 3.4127203889511293e-06,
+      "clip_ratio/low_mean": 4.652173765862244e-05,
+      "clip_ratio/low_min": 8.251542112702737e-06,
+      "clip_ratio/region_mean": 4.993445759282622e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6411.125,
+      "completions/mean_terminated_length": 6252.82568359375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.9852773621678352,
+      "epoch": 0.13523459061637536,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0035624606534838676,
+      "learning_rate": 1e-05,
+      "loss": 0.0601,
+      "num_tokens": 118982515.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3913620114326477,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991655349731,
+      "sampling/importance_sampling_ratio/min": 0.018960632383823395,
+      "sampling/sampling_logp_difference/max": 3.96539044380188,
+      "sampling/sampling_logp_difference/mean": 0.020998675376176834,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 1.710706237645354e-05,
+      "clip_ratio/high_mean": 4.276765594113385e-06,
+      "clip_ratio/low_mean": 2.3662243620492518e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7939009100919066e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15146.0,
+      "completions/mean_length": 6640.75,
+      "completions/mean_terminated_length": 6326.45166015625,
+      "completions/min_length": 1204.0,
+      "completions/min_terminated_length": 1204.0,
+      "entropy": 0.8645239844918251,
+      "epoch": 0.13615455381784727,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004949269350618124,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 119851003.0,
+      "reward": 0.515625,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05,
+      "sampling/sampling_logp_difference/max": 9.749635696411133,
+      "sampling/sampling_logp_difference/mean": 0.01905224658548832,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 5.033624802308623e-06,
+      "clip_ratio/high_mean": 2.0922732346662087e-06,
+      "clip_ratio/low_mean": 5.667686264132499e-05,
+      "clip_ratio/low_min": 3.2221478249994107e-06,
+      "clip_ratio/region_mean": 5.876913564861752e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16061.0,
+      "completions/mean_length": 6987.953125,
+      "completions/mean_terminated_length": 6444.3798828125,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9469119384884834,
+      "epoch": 0.13707451701931922,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005622676108032465,
+      "learning_rate": 1e-05,
+      "loss": 0.1008,
+      "num_tokens": 120765165.0,
+      "reward": 0.421875,
+      "reward_std": 0.39796435832977295,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999727010726929,
+      "sampling/importance_sampling_ratio/min": 9.214873716700822e-05,
+      "sampling/sampling_logp_difference/max": 9.292106628417969,
+      "sampling/sampling_logp_difference/mean": 0.01969297230243683,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 4.223829364491394e-06,
+      "clip_ratio/high_mean": 1.8565209529697313e-06,
+      "clip_ratio/low_mean": 3.030186894648068e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.21583895583899e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16218.0,
+      "completions/mean_length": 7539.2265625,
+      "completions/mean_terminated_length": 6949.5751953125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "entropy": 0.983614593744278,
+      "epoch": 0.13799448022079117,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035846447572112083,
+      "learning_rate": 1e-05,
+      "loss": -0.0093,
+      "num_tokens": 121749426.0,
+      "reward": 0.3828125,
+      "reward_std": 0.22461043298244476,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000232458114624,
+      "sampling/importance_sampling_ratio/min": 7.889377229730599e-06,
+      "sampling/sampling_logp_difference/max": 11.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.02050059661269188,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 1.0533551176195033e-05,
+      "clip_ratio/high_mean": 2.6333877940487582e-06,
+      "clip_ratio/low_mean": 4.44662659901951e-05,
+      "clip_ratio/low_min": 5.9182802942814305e-06,
+      "clip_ratio/region_mean": 4.7099654238991207e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15528.0,
+      "completions/mean_length": 6339.5390625,
+      "completions/mean_terminated_length": 5845.548828125,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.9051830619573593,
+      "epoch": 0.1389144434222631,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005333681590855122,
+      "learning_rate": 1e-05,
+      "loss": 0.0479,
+      "num_tokens": 122579975.0,
+      "reward": 0.34375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999289512634277,
+      "sampling/importance_sampling_ratio/min": 0.0022614477202296257,
+      "sampling/sampling_logp_difference/max": 6.091750144958496,
+      "sampling/sampling_logp_difference/mean": 0.019756250083446503,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 5.961464921711013e-06,
+      "clip_ratio/high_mean": 1.4903662304277532e-06,
+      "clip_ratio/low_mean": 5.054293433204293e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2033300562470686e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6610.8046875,
+      "completions/mean_terminated_length": 6533.8505859375,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9121239259839058,
+      "epoch": 0.13983440662373506,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005628545768558979,
+      "learning_rate": 1e-05,
+      "loss": 0.1029,
+      "num_tokens": 123444686.0,
+      "reward": 0.5,
+      "reward_std": 0.3498311936855316,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.0027667356189340353,
+      "sampling/sampling_logp_difference/max": 5.890087127685547,
+      "sampling/sampling_logp_difference/mean": 0.019961554557085037,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 7.918152277852641e-06,
+      "clip_ratio/high_mean": 2.778689122351352e-06,
+      "clip_ratio/low_mean": 4.231535649523721e-05,
+      "clip_ratio/low_min": 3.3862490909086773e-06,
+      "clip_ratio/region_mean": 4.509404539021489e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15071.0,
+      "completions/mean_length": 7214.5546875,
+      "completions/mean_terminated_length": 6684.0908203125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.9393481463193893,
+      "epoch": 0.140754369825207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00521192466840148,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 124389325.0,
+      "reward": 0.25,
+      "reward_std": 0.26538968086242676,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000994205474854,
+      "sampling/importance_sampling_ratio/min": 0.03890184313058853,
+      "sampling/sampling_logp_difference/max": 3.246713638305664,
+      "sampling/sampling_logp_difference/mean": 0.02030467614531517,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 1.3099364878144115e-05,
+      "clip_ratio/high_mean": 3.274841219536029e-06,
+      "clip_ratio/low_mean": 4.0359405488743505e-05,
+      "clip_ratio/low_min": 3.400342848181026e-06,
+      "clip_ratio/region_mean": 4.363424682196637e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15349.0,
+      "completions/mean_length": 7307.296875,
+      "completions/mean_terminated_length": 6938.32470703125,
+      "completions/min_length": 656.0,
+      "completions/min_terminated_length": 656.0,
+      "entropy": 0.9287968128919601,
+      "epoch": 0.14167433302667892,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034769594203680754,
+      "learning_rate": 1e-05,
+      "loss": 0.0557,
+      "num_tokens": 125344827.0,
+      "reward": 0.390625,
+      "reward_std": 0.35035035014152527,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738335609436,
+      "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05,
+      "sampling/sampling_logp_difference/max": 9.88245964050293,
+      "sampling/sampling_logp_difference/mean": 0.0197945274412632,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 6.428839697036892e-06,
+      "clip_ratio/high_mean": 1.607209924259223e-06,
+      "clip_ratio/low_mean": 3.123730675724801e-05,
+      "clip_ratio/low_min": 4.124868155486183e-06,
+      "clip_ratio/region_mean": 3.284451713625458e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14940.0,
+      "completions/mean_length": 7255.5703125,
+      "completions/mean_terminated_length": 7110.6748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.9288185387849808,
+      "epoch": 0.14259429622815087,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005860861856490374,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "num_tokens": 126294060.0,
+      "reward": 0.3359375,
+      "reward_std": 0.29719966650009155,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999392032623291,
+      "sampling/importance_sampling_ratio/min": 0.0017037172801792622,
+      "sampling/sampling_logp_difference/max": 6.374942779541016,
+      "sampling/sampling_logp_difference/mean": 0.019849762320518494,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 1.148841965914471e-05,
+      "clip_ratio/high_mean": 2.8721049147861777e-06,
+      "clip_ratio/low_mean": 4.209472854199703e-05,
+      "clip_ratio/low_min": 3.21056154461985e-06,
+      "clip_ratio/region_mean": 4.496683322940953e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16174.0,
+      "completions/mean_length": 6662.796875,
+      "completions/mean_terminated_length": 6429.48828125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.8072321340441704,
+      "epoch": 0.14351425942962281,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004109901376068592,
+      "learning_rate": 1e-05,
+      "loss": 0.0365,
+      "num_tokens": 127163746.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998985528945923,
+      "sampling/importance_sampling_ratio/min": 0.001930873841047287,
+      "sampling/sampling_logp_difference/max": 6.249782562255859,
+      "sampling/sampling_logp_difference/mean": 0.018542557954788208,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 1.4845912573946407e-05,
+      "clip_ratio/high_mean": 3.7114781434866018e-06,
+      "clip_ratio/low_mean": 3.845731936280572e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.216879796103967e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16168.0,
+      "completions/mean_length": 6638.5625,
+      "completions/mean_terminated_length": 6483.87353515625,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9228496253490448,
+      "epoch": 0.14443422263109476,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005437003914266825,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "num_tokens": 128035690.0,
+      "reward": 0.4453125,
+      "reward_std": 0.325370192527771,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999808073043823,
+      "sampling/importance_sampling_ratio/min": 0.0007831641123630106,
+      "sampling/sampling_logp_difference/max": 7.152168273925781,
+      "sampling/sampling_logp_difference/mean": 0.019497953355312347,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.087737986286811e-05,
+      "clip_ratio/low_min": 1.7309419035882456e-05,
+      "clip_ratio/region_mean": 5.087737986286811e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 8279.7890625,
+      "completions/mean_terminated_length": 7810.9501953125,
+      "completions/min_length": 1084.0,
+      "completions/min_terminated_length": 1084.0,
+      "entropy": 0.9365477114915848,
+      "epoch": 0.1453541858325667,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004606325179338455,
+      "learning_rate": 1e-05,
+      "loss": 0.0553,
+      "num_tokens": 129114487.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999224543571472,
+      "sampling/importance_sampling_ratio/min": 6.793912234570598e-06,
+      "sampling/sampling_logp_difference/max": 11.899483680725098,
+      "sampling/sampling_logp_difference/mean": 0.02114839106798172,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 2.8393386855896097e-05,
+      "clip_ratio/high_mean": 7.731617188255768e-06,
+      "clip_ratio/low_mean": 4.6293902641991735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.402551937550015e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15157.0,
+      "completions/mean_length": 6874.5546875,
+      "completions/mean_terminated_length": 6406.87646484375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.8596161976456642,
+      "epoch": 0.14627414903403863,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032216343097388744,
+      "learning_rate": 1e-05,
+      "loss": 0.0979,
+      "num_tokens": 130011934.0,
+      "reward": 0.46875,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999432563781738,
+      "sampling/importance_sampling_ratio/min": 0.0028106109239161015,
+      "sampling/sampling_logp_difference/max": 5.874353408813477,
+      "sampling/sampling_logp_difference/mean": 0.01938377134501934,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 9.702946044853888e-06,
+      "clip_ratio/high_mean": 2.425736511213472e-06,
+      "clip_ratio/low_mean": 2.8597237701433187e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1022973985272984e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16193.0,
+      "completions/mean_length": 6554.3671875,
+      "completions/mean_terminated_length": 6154.78857421875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9097465947270393,
+      "epoch": 0.14719411223551057,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0032756594009697437,
+      "learning_rate": 1e-05,
+      "loss": 0.0401,
+      "num_tokens": 130870045.0,
+      "reward": 0.453125,
+      "reward_std": 0.3006146550178528,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 9.237467679668043e-09,
+      "sampling/sampling_logp_difference/max": 18.499998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019287925213575363,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 2.387705990258837e-05,
+      "clip_ratio/high_mean": 5.969264975647093e-06,
+      "clip_ratio/low_mean": 4.071546266004589e-05,
+      "clip_ratio/low_min": 2.701884795897058e-06,
+      "clip_ratio/region_mean": 4.6684727863066655e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15371.0,
+      "completions/mean_length": 7199.9921875,
+      "completions/mean_terminated_length": 6903.73388671875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.9904173016548157,
+      "epoch": 0.14811407543698252,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003530750283971429,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "num_tokens": 131812236.0,
+      "reward": 0.3125,
+      "reward_std": 0.30221718549728394,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999665021896362,
+      "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06,
+      "sampling/sampling_logp_difference/max": 12.864561080932617,
+      "sampling/sampling_logp_difference/mean": 0.02212757244706154,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 1.924166053868248e-05,
+      "clip_ratio/high_mean": 4.81041513467062e-06,
+      "clip_ratio/low_mean": 4.526082898337336e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.007124354960979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 6822.59375,
+      "completions/mean_terminated_length": 6670.82568359375,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "entropy": 1.0052980855107307,
+      "epoch": 0.14903403863845446,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004500554408878088,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 132711448.0,
+      "reward": 0.3203125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998499751091003,
+      "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07,
+      "sampling/sampling_logp_difference/max": 15.995189666748047,
+      "sampling/sampling_logp_difference/mean": 0.02111719362437725,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 1.3326032785698771e-05,
+      "clip_ratio/high_mean": 3.331508196424693e-06,
+      "clip_ratio/low_mean": 1.9409651486057555e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.274115956879541e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7792.9375,
+      "completions/mean_terminated_length": 7515.80615234375,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.9114394783973694,
+      "epoch": 0.1499540018399264,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020288117229938507,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 133729832.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2501322627067566,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999821782112122,
+      "sampling/importance_sampling_ratio/min": 0.001612494932487607,
+      "sampling/sampling_logp_difference/max": 6.4299726486206055,
+      "sampling/sampling_logp_difference/mean": 0.020228523761034012,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 1.2359042557363864e-05,
+      "clip_ratio/high_mean": 3.089760639340966e-06,
+      "clip_ratio/low_mean": 2.9356229674704082e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.244599008667137e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15171.0,
+      "completions/mean_length": 5908.671875,
+      "completions/mean_terminated_length": 5826.18896484375,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.9869658201932907,
+      "epoch": 0.15087396504139836,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006685085594654083,
+      "learning_rate": 1e-05,
+      "loss": -0.0036,
+      "num_tokens": 134507182.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999651908874512,
+      "sampling/importance_sampling_ratio/min": 0.0008160656434483826,
+      "sampling/sampling_logp_difference/max": 7.111015796661377,
+      "sampling/sampling_logp_difference/mean": 0.01997402310371399,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 8.511433406965807e-06,
+      "clip_ratio/high_mean": 2.1278583517414518e-06,
+      "clip_ratio/low_mean": 3.215114134036412e-05,
+      "clip_ratio/low_min": 3.941849627153715e-06,
+      "clip_ratio/region_mean": 3.427900014685292e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16000.0,
+      "completions/mean_length": 7188.0,
+      "completions/mean_terminated_length": 6735.7373046875,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.9519504383206367,
+      "epoch": 0.15179392824287027,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003854887094348669,
+      "learning_rate": 1e-05,
+      "loss": 0.0946,
+      "num_tokens": 135446382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998975992202759,
+      "sampling/importance_sampling_ratio/min": 0.0011354254093021154,
+      "sampling/sampling_logp_difference/max": 6.780747890472412,
+      "sampling/sampling_logp_difference/mean": 0.020226184278726578,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 7.114804702723632e-06,
+      "clip_ratio/high_mean": 1.778701175680908e-06,
+      "clip_ratio/low_mean": 1.9188738406228367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0967439695596113e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15712.0,
+      "completions/mean_length": 5843.5234375,
+      "completions/mean_terminated_length": 5676.21484375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "entropy": 0.9677107483148575,
+      "epoch": 0.15271389144434222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006265874952077866,
+      "learning_rate": 1e-05,
+      "loss": 0.0055,
+      "num_tokens": 136213233.0,
+      "reward": 0.296875,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999513626098633,
+      "sampling/importance_sampling_ratio/min": 0.002176719717681408,
+      "sampling/sampling_logp_difference/max": 6.129936218261719,
+      "sampling/sampling_logp_difference/mean": 0.021706756204366684,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 5.9216449699306395e-06,
+      "clip_ratio/high_mean": 1.4804112424826599e-06,
+      "clip_ratio/low_mean": 2.429895857858355e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.910307100341015e-06,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 6942.15625,
+      "completions/mean_terminated_length": 6637.58056640625,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 1.076062560081482,
+      "epoch": 0.15363385464581417,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0018056798726320267,
+      "learning_rate": 1e-05,
+      "loss": 0.0052,
+      "num_tokens": 137123405.0,
+      "reward": 0.2578125,
+      "reward_std": 0.172288179397583,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101161956787,
+      "sampling/importance_sampling_ratio/min": 0.022795137017965317,
+      "sampling/sampling_logp_difference/max": 3.781208038330078,
+      "sampling/sampling_logp_difference/mean": 0.02278529666364193,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 2.8421666684153024e-05,
+      "clip_ratio/high_mean": 8.364482027900522e-06,
+      "clip_ratio/low_mean": 4.042915224999888e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8793634050525725e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 7368.4609375,
+      "completions/mean_terminated_length": 7001.9755859375,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.9278362467885017,
+      "epoch": 0.1545538178472861,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002132089575752616,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "num_tokens": 138084464.0,
+      "reward": 0.421875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999951958656311,
+      "sampling/importance_sampling_ratio/min": 5.144971510162577e-05,
+      "sampling/sampling_logp_difference/max": 9.874905586242676,
+      "sampling/sampling_logp_difference/mean": 0.020028186962008476,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 6.84724363964051e-06,
+      "clip_ratio/high_mean": 1.7118109099101275e-06,
+      "clip_ratio/low_mean": 3.8177841361175524e-05,
+      "clip_ratio/low_min": 9.023873644764535e-06,
+      "clip_ratio/region_mean": 3.988965249845933e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 8278.578125,
+      "completions/mean_terminated_length": 8017.11279296875,
+      "completions/min_length": 1203.0,
+      "completions/min_terminated_length": 1203.0,
+      "entropy": 0.9731236174702644,
+      "epoch": 0.15547378104875806,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003180777421221137,
+      "learning_rate": 1e-05,
+      "loss": 0.0708,
+      "num_tokens": 139164722.0,
+      "reward": 0.296875,
+      "reward_std": 0.29143065214157104,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999357461929321,
+      "sampling/importance_sampling_ratio/min": 4.579544565785909e-06,
+      "sampling/sampling_logp_difference/max": 12.29391098022461,
+      "sampling/sampling_logp_difference/mean": 0.020700933411717415,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 2.3081439849192975e-05,
+      "clip_ratio/high_mean": 7.712801448178652e-06,
+      "clip_ratio/low_mean": 4.41923687048984e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.190517117625859e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 7237.2578125,
+      "completions/mean_terminated_length": 6865.43896484375,
+      "completions/min_length": 1078.0,
+      "completions/min_terminated_length": 1078.0,
+      "entropy": 0.7624354660511017,
+      "epoch": 0.15639374425023,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004162010736763477,
+      "learning_rate": 1e-05,
+      "loss": 0.0955,
+      "num_tokens": 140109163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.33903974294662476,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999967813491821,
+      "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05,
+      "sampling/sampling_logp_difference/max": 10.63192367553711,
+      "sampling/sampling_logp_difference/mean": 0.017928704619407654,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 1.8008577626460465e-05,
+      "clip_ratio/high_mean": 4.502144406615116e-06,
+      "clip_ratio/low_mean": 2.0606968291758676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.510911281206063e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15857.0,
+      "completions/mean_length": 7307.4296875,
+      "completions/mean_terminated_length": 7089.59228515625,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.9450376927852631,
+      "epoch": 0.15731370745170192,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003527693450450897,
+      "learning_rate": 1e-05,
+      "loss": 0.0442,
+      "num_tokens": 141063738.0,
+      "reward": 0.2890625,
+      "reward_std": 0.22673209011554718,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998871088027954,
+      "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05,
+      "sampling/sampling_logp_difference/max": 10.455191612243652,
+      "sampling/sampling_logp_difference/mean": 0.021664291620254517,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 1.9155178961227648e-05,
+      "clip_ratio/high_mean": 4.788794740306912e-06,
+      "clip_ratio/low_mean": 3.323748410366534e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.802627873028541e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 5985.9765625,
+      "completions/mean_terminated_length": 5736.42431640625,
+      "completions/min_length": 714.0,
+      "completions/min_terminated_length": 714.0,
+      "entropy": 0.8568939119577408,
+      "epoch": 0.15823367065317387,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002400327706709504,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "num_tokens": 141848599.0,
+      "reward": 0.4921875,
+      "reward_std": 0.1922685205936432,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999374151229858,
+      "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08,
+      "sampling/sampling_logp_difference/max": 18.115007400512695,
+      "sampling/sampling_logp_difference/mean": 0.018963739275932312,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 1.6673273876222083e-05,
+      "clip_ratio/high_mean": 4.978134711564053e-06,
+      "clip_ratio/low_mean": 4.1565862602510606e-05,
+      "clip_ratio/low_min": 6.89249168317474e-06,
+      "clip_ratio/region_mean": 4.654399640457996e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15111.0,
+      "completions/mean_length": 8078.8359375,
+      "completions/mean_terminated_length": 7810.92724609375,
+      "completions/min_length": 594.0,
+      "completions/min_terminated_length": 594.0,
+      "entropy": 1.0634759217500687,
+      "epoch": 0.15915363385464582,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003575773909687996,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 142902666.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999899864196777,
+      "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06,
+      "sampling/sampling_logp_difference/max": 13.205151557922363,
+      "sampling/sampling_logp_difference/mean": 0.021685753017663956,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 1.2325835996307433e-05,
+      "clip_ratio/high_mean": 3.081458999076858e-06,
+      "clip_ratio/low_mean": 4.288118509521155e-05,
+      "clip_ratio/low_min": 7.69851726545312e-06,
+      "clip_ratio/region_mean": 4.596264443534892e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15876.0,
+      "completions/mean_length": 8138.515625,
+      "completions/mean_terminated_length": 7588.81689453125,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 1.0329038575291634,
+      "epoch": 0.16007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003307635197415948,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 143967484.0,
+      "reward": 0.3203125,
+      "reward_std": 0.31800350546836853,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000429153442383,
+      "sampling/importance_sampling_ratio/min": 0.07909657061100006,
+      "sampling/sampling_logp_difference/max": 2.537085771560669,
+      "sampling/sampling_logp_difference/mean": 0.02233392372727394,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 2.3158392650657333e-05,
+      "clip_ratio/high_mean": 5.789598162664333e-06,
+      "clip_ratio/low_mean": 3.4071419804604375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.986101773989503e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 8144.21875,
+      "completions/mean_terminated_length": 7878.4189453125,
+      "completions/min_length": 828.0,
+      "completions/min_terminated_length": 828.0,
+      "entropy": 0.9547601044178009,
+      "epoch": 0.1609935602575897,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022392498794943094,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 145028608.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411096513271332,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473094940186,
+      "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06,
+      "sampling/sampling_logp_difference/max": 12.749860763549805,
+      "sampling/sampling_logp_difference/mean": 0.0203234925866127,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 1.330557415712974e-05,
+      "clip_ratio/high_mean": 3.326393539282435e-06,
+      "clip_ratio/low_mean": 3.57260964847228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.905248979663156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16327.0,
+      "completions/mean_length": 6289.40625,
+      "completions/mean_terminated_length": 6129.1748046875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9483931511640549,
+      "epoch": 0.16191352345906163,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005551324691623449,
+      "learning_rate": 1e-05,
+      "loss": 0.085,
+      "num_tokens": 145851292.0,
+      "reward": 0.484375,
+      "reward_std": 0.327729195356369,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999369382858276,
+      "sampling/importance_sampling_ratio/min": 0.0024864254519343376,
+      "sampling/sampling_logp_difference/max": 5.996909141540527,
+      "sampling/sampling_logp_difference/mean": 0.020259611308574677,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 8.344215302713565e-06,
+      "clip_ratio/high_mean": 2.086053825678391e-06,
+      "clip_ratio/low_mean": 5.073524926046957e-05,
+      "clip_ratio/low_min": 2.859953838196816e-06,
+      "clip_ratio/region_mean": 5.282130268824403e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16176.0,
+      "completions/mean_length": 8855.9296875,
+      "completions/mean_terminated_length": 8354.05859375,
+      "completions/min_length": 1004.0,
+      "completions/min_terminated_length": 1004.0,
+      "entropy": 1.003264345228672,
+      "epoch": 0.16283348666053357,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038497373461723328,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 147004723.0,
+      "reward": 0.2890625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000008344650269,
+      "sampling/importance_sampling_ratio/min": 0.0003718819934874773,
+      "sampling/sampling_logp_difference/max": 7.8969340324401855,
+      "sampling/sampling_logp_difference/mean": 0.02178027108311653,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 1.2368503575999057e-05,
+      "clip_ratio/high_mean": 3.0921258939997642e-06,
+      "clip_ratio/low_mean": 4.947490833728807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.256703434497467e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16199.0,
+      "completions/mean_length": 7574.3359375,
+      "completions/mean_terminated_length": 7434.50048828125,
+      "completions/min_length": 856.0,
+      "completions/min_terminated_length": 856.0,
+      "entropy": 0.9448538422584534,
+      "epoch": 0.16375344986200552,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005027150269597769,
+      "learning_rate": 1e-05,
+      "loss": 0.054,
+      "num_tokens": 147996190.0,
+      "reward": 0.359375,
+      "reward_std": 0.3316858410835266,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 2.846284814950195e-06,
+      "sampling/sampling_logp_difference/max": 12.769495964050293,
+      "sampling/sampling_logp_difference/mean": 0.020686112344264984,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 1.6756753666413715e-05,
+      "clip_ratio/high_mean": 4.189188416603429e-06,
+      "clip_ratio/low_mean": 3.363430948866153e-05,
+      "clip_ratio/low_min": 3.5745945297094295e-06,
+      "clip_ratio/region_mean": 3.7823498018951796e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15821.0,
+      "completions/mean_length": 7162.5625,
+      "completions/mean_terminated_length": 6787.70703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.8928515017032623,
+      "epoch": 0.16467341306347746,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325182662345469,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 148931006.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3492894768714905,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07,
+      "sampling/sampling_logp_difference/max": 15.537620544433594,
+      "sampling/sampling_logp_difference/mean": 0.02043815702199936,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 2.08163191928179e-05,
+      "clip_ratio/high_mean": 5.204079798204475e-06,
+      "clip_ratio/low_mean": 2.8009484594804235e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3213564165635034e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16163.0,
+      "completions/mean_length": 7958.2109375,
+      "completions/mean_terminated_length": 7396.4921875,
+      "completions/min_length": 809.0,
+      "completions/min_terminated_length": 809.0,
+      "entropy": 0.8763524517416954,
+      "epoch": 0.1655933762649494,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003250610316172242,
+      "learning_rate": 1e-05,
+      "loss": 0.0388,
+      "num_tokens": 149968481.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999974250793457,
+      "sampling/importance_sampling_ratio/min": 1.370981294712692e-06,
+      "sampling/sampling_logp_difference/max": 13.499983787536621,
+      "sampling/sampling_logp_difference/mean": 0.020478684455156326,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 1.4398233361134771e-05,
+      "clip_ratio/high_mean": 4.918068043480162e-06,
+      "clip_ratio/low_mean": 1.937760777082076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4295676269048272e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15728.0,
+      "completions/mean_length": 6120.296875,
+      "completions/mean_terminated_length": 5789.20947265625,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7507334873080254,
+      "epoch": 0.16651333946642136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004865634720772505,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 150768791.0,
+      "reward": 0.5703125,
+      "reward_std": 0.24671241641044617,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999062418937683,
+      "sampling/importance_sampling_ratio/min": 3.535915311658755e-05,
+      "sampling/sampling_logp_difference/max": 10.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.01739395596086979,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 1.170663267657801e-05,
+      "clip_ratio/high_mean": 2.9266581691445026e-06,
+      "clip_ratio/low_mean": 5.480891331899329e-05,
+      "clip_ratio/low_min": 9.078275525098434e-06,
+      "clip_ratio/region_mean": 5.773557131760754e-05,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 8231.671875,
+      "completions/mean_terminated_length": 7230.5087890625,
+      "completions/min_length": 1231.0,
+      "completions/min_terminated_length": 1231.0,
+      "entropy": 0.8613645136356354,
+      "epoch": 0.16743330266789327,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0027805580757558346,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 151844301.0,
+      "reward": 0.34375,
+      "reward_std": 0.35088711977005005,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957799911499,
+      "sampling/importance_sampling_ratio/min": 0.0015732402680441737,
+      "sampling/sampling_logp_difference/max": 6.454617977142334,
+      "sampling/sampling_logp_difference/mean": 0.019971080124378204,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 1.0858868336072192e-05,
+      "clip_ratio/high_mean": 2.714717084018048e-06,
+      "clip_ratio/low_mean": 4.333486742780224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.60495848528808e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15682.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6038.4921875,
+      "completions/mean_terminated_length": 6038.4921875,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.8801494240760803,
+      "epoch": 0.16835326586936522,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0028903940692543983,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "num_tokens": 152638356.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3022122383117676,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999019503593445,
+      "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06,
+      "sampling/sampling_logp_difference/max": 12.374916076660156,
+      "sampling/sampling_logp_difference/mean": 0.019382324069738388,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 7.320573104152572e-06,
+      "clip_ratio/high_mean": 1.830143276038143e-06,
+      "clip_ratio/low_mean": 4.994629193788569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.177643492970674e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16235.0,
+      "completions/mean_length": 7506.921875,
+      "completions/mean_terminated_length": 7070.34375,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.8713229671120644,
+      "epoch": 0.16927322907083717,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0029546513687819242,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 153618418.0,
+      "reward": 0.3828125,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000388622283936,
+      "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07,
+      "sampling/sampling_logp_difference/max": 14.616228103637695,
+      "sampling/sampling_logp_difference/mean": 0.01928526908159256,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 1.2992590200155973e-05,
+      "clip_ratio/high_mean": 3.2481475500389934e-06,
+      "clip_ratio/low_mean": 2.8494011758084525e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.174215930812352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13999.0,
+      "completions/mean_length": 6725.921875,
+      "completions/mean_terminated_length": 6649.8740234375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "entropy": 0.9011344686150551,
+      "epoch": 0.1701931922723091,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002384800696745515,
+      "learning_rate": 1e-05,
+      "loss": 0.0837,
+      "num_tokens": 154502440.0,
+      "reward": 0.46875,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999340772628784,
+      "sampling/importance_sampling_ratio/min": 0.0008398547652177513,
+      "sampling/sampling_logp_difference/max": 7.082281589508057,
+      "sampling/sampling_logp_difference/mean": 0.020737573504447937,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 2.686360085135675e-05,
+      "clip_ratio/high_mean": 7.414224342028319e-06,
+      "clip_ratio/low_mean": 3.7723172567893926e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5137397364669596e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15862.0,
+      "completions/mean_length": 7285.78125,
+      "completions/mean_terminated_length": 6992.2900390625,
+      "completions/min_length": 1176.0,
+      "completions/min_terminated_length": 1176.0,
+      "entropy": 1.028538629412651,
+      "epoch": 0.17111315547378106,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033664393704384565,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 155454988.0,
+      "reward": 0.296875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.0003808041801676154,
+      "sampling/sampling_logp_difference/max": 7.873225212097168,
+      "sampling/sampling_logp_difference/mean": 0.022076331079006195,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 1.1637549050647067e-05,
+      "clip_ratio/high_mean": 2.9093872626617667e-06,
+      "clip_ratio/low_mean": 3.613749231590191e-05,
+      "clip_ratio/low_min": 6.27866324975912e-06,
+      "clip_ratio/region_mean": 3.904687946487684e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 7546.1015625,
+      "completions/mean_terminated_length": 6956.90869140625,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.9216663613915443,
+      "epoch": 0.17203311867525298,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029569920152425766,
+      "learning_rate": 1e-05,
+      "loss": 0.0995,
+      "num_tokens": 156439609.0,
+      "reward": 0.390625,
+      "reward_std": 0.305637001991272,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999749660491943,
+      "sampling/importance_sampling_ratio/min": 0.009956372901797295,
+      "sampling/sampling_logp_difference/max": 4.609542369842529,
+      "sampling/sampling_logp_difference/mean": 0.021088771522045135,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 6.485023732238915e-06,
+      "clip_ratio/high_mean": 1.6212559330597287e-06,
+      "clip_ratio/low_mean": 1.9624552805908024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1245808738967753e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16271.0,
+      "completions/mean_length": 6866.6015625,
+      "completions/mean_terminated_length": 6791.66162109375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.9553637430071831,
+      "epoch": 0.17295308187672492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023973705247044563,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 157343374.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 7.46340665500611e-05,
+      "sampling/sampling_logp_difference/max": 9.502913475036621,
+      "sampling/sampling_logp_difference/mean": 0.021616388112306595,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 9.11087408894673e-06,
+      "clip_ratio/high_mean": 2.2777185222366825e-06,
+      "clip_ratio/low_mean": 3.832016966498486e-05,
+      "clip_ratio/low_min": 5.240211066848133e-06,
+      "clip_ratio/region_mean": 4.059788818722154e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14284.0,
+      "completions/mean_length": 6335.9453125,
+      "completions/mean_terminated_length": 5754.65283203125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.8574290797114372,
+      "epoch": 0.17387304507819687,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023072708863765,
+      "learning_rate": 1e-05,
+      "loss": 0.0056,
+      "num_tokens": 158173719.0,
+      "reward": 0.4140625,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998940229415894,
+      "sampling/importance_sampling_ratio/min": 0.0001612449559615925,
+      "sampling/sampling_logp_difference/max": 8.732585906982422,
+      "sampling/sampling_logp_difference/mean": 0.018506702035665512,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 3.0578403084291494e-05,
+      "clip_ratio/high_mean": 9.993626633786334e-06,
+      "clip_ratio/low_mean": 5.610333710137638e-05,
+      "clip_ratio/low_min": 1.3168393707019277e-05,
+      "clip_ratio/region_mean": 6.609696265513776e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 8239.8984375,
+      "completions/mean_terminated_length": 7768.751953125,
+      "completions/min_length": 1080.0,
+      "completions/min_terminated_length": 1080.0,
+      "entropy": 0.8983379155397415,
+      "epoch": 0.17479300827966882,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004400993697345257,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 159248410.0,
+      "reward": 0.3125,
+      "reward_std": 0.32325342297554016,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998342990875244,
+      "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06,
+      "sampling/sampling_logp_difference/max": 13.686293601989746,
+      "sampling/sampling_logp_difference/mean": 0.02096184343099594,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 9.026573934534099e-06,
+      "clip_ratio/high_mean": 2.2566434836335247e-06,
+      "clip_ratio/low_mean": 6.66748674120754e-05,
+      "clip_ratio/low_min": 1.5295650428015506e-05,
+      "clip_ratio/region_mean": 6.89315111230826e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13741.0,
+      "completions/mean_length": 6580.921875,
+      "completions/mean_terminated_length": 5659.26513671875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8277688398957253,
+      "epoch": 0.17571297148114076,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00661451555788517,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 160109904.0,
+      "reward": 0.484375,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 0.00036075623938813806,
+      "sampling/sampling_logp_difference/max": 7.927308082580566,
+      "sampling/sampling_logp_difference/mean": 0.017984790727496147,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 7.435806082867202e-06,
+      "clip_ratio/high_mean": 1.8589515207168006e-06,
+      "clip_ratio/low_mean": 4.045673085784074e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2315682549087796e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 7627.0,
+      "completions/mean_terminated_length": 7416.83251953125,
+      "completions/min_length": 1916.0,
+      "completions/min_terminated_length": 1916.0,
+      "entropy": 0.8832443356513977,
+      "epoch": 0.1766329346826127,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004417019430547953,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 161103384.0,
+      "reward": 0.40625,
+      "reward_std": 0.3634909689426422,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998986721038818,
+      "sampling/importance_sampling_ratio/min": 4.833659477299079e-05,
+      "sampling/sampling_logp_difference/max": 9.937321662902832,
+      "sampling/sampling_logp_difference/mean": 0.01947963796555996,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 9.941184316630824e-06,
+      "clip_ratio/high_mean": 2.485296079157706e-06,
+      "clip_ratio/low_mean": 2.6134909091979353e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8620205910101504e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 8426.1015625,
+      "completions/mean_terminated_length": 7965.72705078125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8188603445887566,
+      "epoch": 0.17755289788408463,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0030983765609562397,
+      "learning_rate": 1e-05,
+      "loss": 0.0426,
+      "num_tokens": 162199765.0,
+      "reward": 0.25,
+      "reward_std": 0.2540663480758667,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999411106109619,
+      "sampling/importance_sampling_ratio/min": 0.0009119694004766643,
+      "sampling/sampling_logp_difference/max": 6.999904155731201,
+      "sampling/sampling_logp_difference/mean": 0.02070600539445877,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 2.612139087432297e-05,
+      "clip_ratio/high_mean": 6.530347718580742e-06,
+      "clip_ratio/low_mean": 3.7853451885894174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.438379949078808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15904.0,
+      "completions/mean_length": 7154.2109375,
+      "completions/mean_terminated_length": 6856.4755859375,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "entropy": 0.9913735538721085,
+      "epoch": 0.17847286108555657,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003430198412388563,
+      "learning_rate": 1e-05,
+      "loss": 0.052,
+      "num_tokens": 163133232.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2120065689086914,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000275373458862,
+      "sampling/importance_sampling_ratio/min": 0.00042929715709760785,
+      "sampling/sampling_logp_difference/max": 7.753361225128174,
+      "sampling/sampling_logp_difference/mean": 0.02190260961651802,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 3.1841454983805306e-06,
+      "clip_ratio/high_mean": 7.960363745951327e-07,
+      "clip_ratio/low_mean": 3.384581600585079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4641852380445926e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 7693.1328125,
+      "completions/mean_terminated_length": 7412.7822265625,
+      "completions/min_length": 1077.0,
+      "completions/min_terminated_length": 1077.0,
+      "entropy": 0.9887127950787544,
+      "epoch": 0.17939282428702852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002780586015433073,
+      "learning_rate": 1e-05,
+      "loss": 0.0449,
+      "num_tokens": 164134393.0,
+      "reward": 0.3515625,
+      "reward_std": 0.20411095023155212,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999028444290161,
+      "sampling/importance_sampling_ratio/min": 3.559096626304381e-07,
+      "sampling/sampling_logp_difference/max": 14.848588943481445,
+      "sampling/sampling_logp_difference/mean": 0.021110571920871735,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 9.770586984814145e-06,
+      "clip_ratio/high_mean": 5.008155312680174e-06,
+      "clip_ratio/low_mean": 5.182203130971175e-05,
+      "clip_ratio/low_min": 1.5574546068819473e-05,
+      "clip_ratio/region_mean": 5.683018616764457e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16129.0,
+      "completions/mean_length": 7072.1484375,
+      "completions/mean_terminated_length": 6771.76611328125,
+      "completions/min_length": 695.0,
+      "completions/min_terminated_length": 695.0,
+      "entropy": 0.861792616546154,
+      "epoch": 0.18031278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030156150460243225,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 165063412.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998926520347595,
+      "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06,
+      "sampling/sampling_logp_difference/max": 12.999247550964355,
+      "sampling/sampling_logp_difference/mean": 0.019325289875268936,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 2.2510209873871645e-05,
+      "clip_ratio/high_mean": 6.455301331698138e-06,
+      "clip_ratio/low_mean": 6.156819108582567e-05,
+      "clip_ratio/low_min": 5.763157332694391e-06,
+      "clip_ratio/region_mean": 6.802349253121065e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15062.0,
+      "completions/mean_length": 7353.421875,
+      "completions/mean_terminated_length": 7062.11279296875,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "entropy": 0.8961873054504395,
+      "epoch": 0.1812327506899724,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034921523183584213,
+      "learning_rate": 1e-05,
+      "loss": 0.0161,
+      "num_tokens": 166024306.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2909066081047058,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999784231185913,
+      "sampling/importance_sampling_ratio/min": 0.0005124400486238301,
+      "sampling/sampling_logp_difference/max": 7.576326847076416,
+      "sampling/sampling_logp_difference/mean": 0.019593238830566406,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 1.3040991007073899e-05,
+      "clip_ratio/high_mean": 4.292725350296678e-06,
+      "clip_ratio/low_mean": 5.347559840629401e-05,
+      "clip_ratio/low_min": 6.613406640099129e-06,
+      "clip_ratio/region_mean": 5.776832381343411e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15604.0,
+      "completions/mean_length": 7348.03125,
+      "completions/mean_terminated_length": 6903.63916015625,
+      "completions/min_length": 1619.0,
+      "completions/min_terminated_length": 1619.0,
+      "entropy": 0.824029266834259,
+      "epoch": 0.18215271389144433,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0027784397825598717,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 166984982.0,
+      "reward": 0.40625,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 0.0010020677000284195,
+      "sampling/sampling_logp_difference/max": 6.905689716339111,
+      "sampling/sampling_logp_difference/mean": 0.01857386901974678,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 3.330808067403268e-05,
+      "clip_ratio/high_mean": 1.0969530649163062e-05,
+      "clip_ratio/low_mean": 3.2080681648949394e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3050211388617754e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16358.0,
+      "completions/mean_length": 7290.4765625,
+      "completions/mean_terminated_length": 6920.82080078125,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8884479627013206,
+      "epoch": 0.18307267709291627,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004110465291887522,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 167936971.0,
+      "reward": 0.4375,
+      "reward_std": 0.35901516675949097,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493360519409,
+      "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06,
+      "sampling/sampling_logp_difference/max": 13.219663619995117,
+      "sampling/sampling_logp_difference/mean": 0.019696572795510292,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 9.77357763076725e-06,
+      "clip_ratio/high_mean": 2.4433944076918124e-06,
+      "clip_ratio/low_mean": 3.466498992565903e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.710838473125477e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15824.0,
+      "completions/mean_length": 7803.625,
+      "completions/mean_terminated_length": 6833.66943359375,
+      "completions/min_length": 929.0,
+      "completions/min_terminated_length": 929.0,
+      "entropy": 0.8326860442757607,
+      "epoch": 0.18399264029438822,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002410614863038063,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "num_tokens": 168955683.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29538238048553467,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999977707862854,
+      "sampling/importance_sampling_ratio/min": 0.0008801451185718179,
+      "sampling/sampling_logp_difference/max": 7.035423755645752,
+      "sampling/sampling_logp_difference/mean": 0.018545793369412422,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 1.4602125929741305e-05,
+      "clip_ratio/high_mean": 3.6505314824353263e-06,
+      "clip_ratio/low_mean": 3.4781527119776e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8432058772741584e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6804.34375,
+      "completions/mean_terminated_length": 6495.322265625,
+      "completions/min_length": 645.0,
+      "completions/min_terminated_length": 645.0,
+      "entropy": 0.9669496119022369,
+      "epoch": 0.18491260349586017,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034376555122435093,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 169845823.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31534504890441895,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 1.767780588579626e-08,
+      "sampling/sampling_logp_difference/max": 17.850955963134766,
+      "sampling/sampling_logp_difference/mean": 0.020515555515885353,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 1.5814722473805887e-05,
+      "clip_ratio/high_mean": 3.953680618451472e-06,
+      "clip_ratio/low_mean": 3.574208744794305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9695768407455034e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16350.0,
+      "completions/mean_length": 6827.9609375,
+      "completions/mean_terminated_length": 6105.23583984375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "entropy": 0.8833946585655212,
+      "epoch": 0.1858325666973321,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0026675171684473753,
+      "learning_rate": 1e-05,
+      "loss": 0.1046,
+      "num_tokens": 170738210.0,
+      "reward": 0.421875,
+      "reward_std": 0.2698654532432556,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000019907951355,
+      "sampling/importance_sampling_ratio/min": 0.002906275913119316,
+      "sampling/sampling_logp_difference/max": 5.840882778167725,
+      "sampling/sampling_logp_difference/mean": 0.019948139786720276,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 1.6623121837255894e-05,
+      "clip_ratio/high_mean": 4.1557804593139736e-06,
+      "clip_ratio/low_mean": 6.462372630267055e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.877950727357529e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15725.0,
+      "completions/mean_length": 7377.984375,
+      "completions/mean_terminated_length": 7307.07080078125,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.8881714344024658,
+      "epoch": 0.18675252989880406,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0039620306342840195,
+      "learning_rate": 1e-05,
+      "loss": 0.034,
+      "num_tokens": 171705152.0,
+      "reward": 0.3359375,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999995231628418,
+      "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05,
+      "sampling/sampling_logp_difference/max": 10.614632606506348,
+      "sampling/sampling_logp_difference/mean": 0.01964445412158966,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 9.639111340220552e-06,
+      "clip_ratio/high_mean": 2.409777835055138e-06,
+      "clip_ratio/low_mean": 2.775239624952519e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0162174198267167e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15265.0,
+      "completions/mean_length": 6051.8828125,
+      "completions/mean_terminated_length": 5543.74560546875,
+      "completions/min_length": 819.0,
+      "completions/min_terminated_length": 819.0,
+      "entropy": 0.8851477280259132,
+      "epoch": 0.18767249310027598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0040458571165800095,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 172501881.0,
+      "reward": 0.4296875,
+      "reward_std": 0.16781240701675415,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999410510063171,
+      "sampling/importance_sampling_ratio/min": 0.0021976607386022806,
+      "sampling/sampling_logp_difference/max": 6.120361804962158,
+      "sampling/sampling_logp_difference/mean": 0.01957303285598755,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 9.72708312474424e-06,
+      "clip_ratio/high_mean": 3.529455852913088e-06,
+      "clip_ratio/low_mean": 5.158422732165491e-05,
+      "clip_ratio/low_min": 1.1939961495954776e-05,
+      "clip_ratio/region_mean": 5.5113683174567996e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7830.171875,
+      "completions/mean_terminated_length": 7409.4912109375,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.9070459827780724,
+      "epoch": 0.18859245630174792,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005941574461758137,
+      "learning_rate": 1e-05,
+      "loss": 0.0427,
+      "num_tokens": 173522391.0,
+      "reward": 0.34375,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000017881393433,
+      "sampling/importance_sampling_ratio/min": 0.00011712420382536948,
+      "sampling/sampling_logp_difference/max": 9.052275657653809,
+      "sampling/sampling_logp_difference/mean": 0.021295130252838135,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 5.5543214330100454e-06,
+      "clip_ratio/high_mean": 1.3885803582525114e-06,
+      "clip_ratio/low_mean": 1.718775109793569e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8576331683561875e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15443.0,
+      "completions/mean_length": 7520.6796875,
+      "completions/mean_terminated_length": 6769.55078125,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.8843575045466423,
+      "epoch": 0.18951241950321987,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0025851845275610685,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 174504534.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 0.00039556476986035705,
+      "sampling/sampling_logp_difference/max": 7.835196018218994,
+      "sampling/sampling_logp_difference/mean": 0.02016005665063858,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 1.0145481155632297e-05,
+      "clip_ratio/high_mean": 2.536370288908074e-06,
+      "clip_ratio/low_mean": 3.617897255026037e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.871534295285528e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16173.0,
+      "completions/mean_length": 7382.1875,
+      "completions/mean_terminated_length": 6861.42138671875,
+      "completions/min_length": 934.0,
+      "completions/min_terminated_length": 934.0,
+      "entropy": 0.916313610970974,
+      "epoch": 0.19043238270469182,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004170550964772701,
+      "learning_rate": 1e-05,
+      "loss": 0.047,
+      "num_tokens": 175472574.0,
+      "reward": 0.46875,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999932646751404,
+      "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05,
+      "sampling/sampling_logp_difference/max": 10.481352806091309,
+      "sampling/sampling_logp_difference/mean": 0.020749717950820923,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.83663013963087e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.83663013963087e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13891.0,
+      "completions/mean_length": 6122.453125,
+      "completions/mean_terminated_length": 6041.6533203125,
+      "completions/min_length": 1192.0,
+      "completions/min_terminated_length": 1192.0,
+      "entropy": 0.8984386026859283,
+      "epoch": 0.19135234590616376,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 176275568.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 7.88934721640544e-06,
+      "sampling/sampling_logp_difference/max": 11.74999713897705,
+      "sampling/sampling_logp_difference/mean": 0.020278753712773323,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 1.4535152331518475e-05,
+      "clip_ratio/high_mean": 3.6337880828796187e-06,
+      "clip_ratio/low_mean": 4.3961883989140915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7595671958333696e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15547.0,
+      "completions/mean_length": 4983.2890625,
+      "completions/mean_terminated_length": 4709.67236328125,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.825260303914547,
+      "epoch": 0.1922723091076357,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004848882555961609,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 176932549.0,
+      "reward": 0.6484375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999616146087646,
+      "sampling/importance_sampling_ratio/min": 1.626804078114219e-05,
+      "sampling/sampling_logp_difference/max": 11.026308059692383,
+      "sampling/sampling_logp_difference/mean": 0.017959970980882645,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 1.1141860795760294e-05,
+      "clip_ratio/high_mean": 2.7854651989400736e-06,
+      "clip_ratio/low_mean": 4.2418692146384274e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5204157913758536e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15415.0,
+      "completions/mean_length": 5766.5234375,
+      "completions/mean_terminated_length": 5511.7041015625,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.9016259610652924,
+      "epoch": 0.19319227230910763,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004749474115669727,
+      "learning_rate": 1e-05,
+      "loss": 0.0977,
+      "num_tokens": 177691752.0,
+      "reward": 0.5,
+      "reward_std": 0.2738044261932373,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000141859054565,
+      "sampling/importance_sampling_ratio/min": 8.927558155846782e-06,
+      "sampling/sampling_logp_difference/max": 11.626367568969727,
+      "sampling/sampling_logp_difference/mean": 0.019118282943964005,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 5.5243735914700665e-06,
+      "clip_ratio/high_mean": 2.1587275114143267e-06,
+      "clip_ratio/low_mean": 4.609663824339805e-05,
+      "clip_ratio/low_min": 3.983555870945565e-06,
+      "clip_ratio/region_mean": 4.8255366664307076e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15696.0,
+      "completions/mean_length": 6993.671875,
+      "completions/mean_terminated_length": 6768.30419921875,
+      "completions/min_length": 889.0,
+      "completions/min_terminated_length": 889.0,
+      "entropy": 0.9074988812208176,
+      "epoch": 0.19411223551057957,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004418120253831148,
+      "learning_rate": 1e-05,
+      "loss": 0.1135,
+      "num_tokens": 178603454.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000037670135498,
+      "sampling/importance_sampling_ratio/min": 0.0018135923892259598,
+      "sampling/sampling_logp_difference/max": 6.312445640563965,
+      "sampling/sampling_logp_difference/mean": 0.01957814022898674,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 5.126943051436683e-06,
+      "clip_ratio/high_mean": 1.2817357628591708e-06,
+      "clip_ratio/low_mean": 2.7488794444252562e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.877053032079857e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 7445.1328125,
+      "completions/mean_terminated_length": 6849.20849609375,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9255013465881348,
+      "epoch": 0.19503219871205152,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00237120408564806,
+      "learning_rate": 1e-05,
+      "loss": 0.0172,
+      "num_tokens": 179577063.0,
+      "reward": 0.40625,
+      "reward_std": 0.21040897071361542,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999725818634033,
+      "sampling/importance_sampling_ratio/min": 9.651589061832055e-05,
+      "sampling/sampling_logp_difference/max": 9.245802879333496,
+      "sampling/sampling_logp_difference/mean": 0.02165937051177025,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 1.8956294752570102e-05,
+      "clip_ratio/high_mean": 4.7390736881425255e-06,
+      "clip_ratio/low_mean": 2.6486316301088664e-05,
+      "clip_ratio/low_min": 3.516273409331916e-06,
+      "clip_ratio/region_mean": 3.122539010291803e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 6120.5546875,
+      "completions/mean_terminated_length": 5703.34130859375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.8181199952960014,
+      "epoch": 0.19595216191352346,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004715202376246452,
+      "learning_rate": 1e-05,
+      "loss": 0.1291,
+      "num_tokens": 180380422.0,
+      "reward": 0.5,
+      "reward_std": 0.29355230927467346,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999874472618103,
+      "sampling/importance_sampling_ratio/min": 0.004350374918431044,
+      "sampling/sampling_logp_difference/max": 5.437493324279785,
+      "sampling/sampling_logp_difference/mean": 0.018377620726823807,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 5.594843969447538e-06,
+      "clip_ratio/high_mean": 2.376495558564784e-06,
+      "clip_ratio/low_mean": 3.4097628713425365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6474124044616474e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16005.0,
+      "completions/mean_length": 6351.203125,
+      "completions/mean_terminated_length": 5857.78662109375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.8798654451966286,
+      "epoch": 0.1968721251149954,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003063712501898408,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 181212776.0,
+      "reward": 0.453125,
+      "reward_std": 0.3048579692840576,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999946355819702,
+      "sampling/importance_sampling_ratio/min": 7.891544555604924e-06,
+      "sampling/sampling_logp_difference/max": 11.74971866607666,
+      "sampling/sampling_logp_difference/mean": 0.019523698836565018,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.544438988001275e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.544438988001275e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14180.0,
+      "completions/mean_length": 6330.046875,
+      "completions/mean_terminated_length": 6170.46044921875,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 0.8319354206323624,
+      "epoch": 0.19779208831646733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033194730058312416,
+      "learning_rate": 1e-05,
+      "loss": 0.0924,
+      "num_tokens": 182041910.0,
+      "reward": 0.453125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998994469642639,
+      "sampling/importance_sampling_ratio/min": 0.00010535263572819531,
+      "sampling/sampling_logp_difference/max": 9.158197402954102,
+      "sampling/sampling_logp_difference/mean": 0.018981872126460075,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.7156292415165808e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.7156292415165808e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15982.0,
+      "completions/mean_length": 6665.2890625,
+      "completions/mean_terminated_length": 6351.7822265625,
+      "completions/min_length": 722.0,
+      "completions/min_terminated_length": 722.0,
+      "entropy": 0.9336326420307159,
+      "epoch": 0.19871205151793928,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004492956213653088,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 182914843.0,
+      "reward": 0.3828125,
+      "reward_std": 0.14807432889938354,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000030279159546,
+      "sampling/importance_sampling_ratio/min": 0.011399568989872932,
+      "sampling/sampling_logp_difference/max": 4.474179744720459,
+      "sampling/sampling_logp_difference/mean": 0.02088768407702446,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 3.2495465802639956e-05,
+      "clip_ratio/high_mean": 9.084843100026774e-06,
+      "clip_ratio/low_mean": 5.4809036328151706e-05,
+      "clip_ratio/low_min": 8.953898031904828e-06,
+      "clip_ratio/region_mean": 6.389387954186532e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16064.0,
+      "completions/mean_length": 5393.9140625,
+      "completions/mean_terminated_length": 5039.39501953125,
+      "completions/min_length": 628.0,
+      "completions/min_terminated_length": 628.0,
+      "entropy": 0.7864786610007286,
+      "epoch": 0.19963201471941122,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003816079581156373,
+      "learning_rate": 1e-05,
+      "loss": -0.004,
+      "num_tokens": 183628152.0,
+      "reward": 0.546875,
+      "reward_std": 0.31694266200065613,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998779892921448,
+      "sampling/importance_sampling_ratio/min": 0.003246711567044258,
+      "sampling/sampling_logp_difference/max": 5.730112552642822,
+      "sampling/sampling_logp_difference/mean": 0.018448319286108017,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 8.638648068881594e-06,
+      "clip_ratio/high_mean": 2.1596620172203984e-06,
+      "clip_ratio/low_mean": 1.6896704778446292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9056366909353528e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15453.0,
+      "completions/mean_length": 7161.5,
+      "completions/mean_terminated_length": 7015.111328125,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "entropy": 0.915394201874733,
+      "epoch": 0.20055197792088317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003666195785626769,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 184562352.0,
+      "reward": 0.3671875,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00025550799909979105,
+      "sampling/sampling_logp_difference/max": 8.272256851196289,
+      "sampling/sampling_logp_difference/mean": 0.019755780696868896,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 6.424931598303374e-06,
+      "clip_ratio/high_mean": 1.6062328995758435e-06,
+      "clip_ratio/low_mean": 2.49038239417132e-05,
+      "clip_ratio/low_min": 4.00025601265952e-06,
+      "clip_ratio/region_mean": 2.651005689813246e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15408.0,
+      "completions/mean_length": 7957.671875,
+      "completions/mean_terminated_length": 7685.8544921875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "entropy": 1.1176252663135529,
+      "epoch": 0.2014719411223551,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025940234772861004,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 185606670.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.1171875,
+      "rewards/accuracy_reward/std": 0.322907418012619,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999893844127655,
+      "sampling/importance_sampling_ratio/min": 0.0007622809498570859,
+      "sampling/sampling_logp_difference/max": 7.179195404052734,
+      "sampling/sampling_logp_difference/mean": 0.02338646724820137,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 1.9903963220713194e-05,
+      "clip_ratio/high_mean": 5.829163114867697e-06,
+      "clip_ratio/low_mean": 4.4742550926457625e-05,
+      "clip_ratio/low_min": 3.5803282116830815e-06,
+      "clip_ratio/region_mean": 5.057171370026481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 7060.6640625,
+      "completions/mean_terminated_length": 6759.9111328125,
+      "completions/min_length": 1460.0,
+      "completions/min_terminated_length": 1460.0,
+      "entropy": 0.9148540124297142,
+      "epoch": 0.20239190432382706,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004315398633480072,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 186526883.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0004585353017318994,
+      "sampling/sampling_logp_difference/max": 7.687473297119141,
+      "sampling/sampling_logp_difference/mean": 0.01967843994498253,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 1.147099328591139e-05,
+      "clip_ratio/high_mean": 2.8677483214778476e-06,
+      "clip_ratio/low_mean": 2.8967988555450574e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1835736763241584e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15596.0,
+      "completions/mean_length": 6649.6640625,
+      "completions/mean_terminated_length": 6416.04052734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9298559054732323,
+      "epoch": 0.20331186752529898,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030786178540438414,
+      "learning_rate": 1e-05,
+      "loss": 0.0606,
+      "num_tokens": 187397536.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000005841255188,
+      "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07,
+      "sampling/sampling_logp_difference/max": 14.929608345031738,
+      "sampling/sampling_logp_difference/mean": 0.020215414464473724,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 2.2768570943298982e-05,
+      "clip_ratio/high_mean": 5.692142735824746e-06,
+      "clip_ratio/low_mean": 3.249637484259438e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8188517464732286e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 8292.015625,
+      "completions/mean_terminated_length": 7823.8837890625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.8232023045420647,
+      "epoch": 0.20423183072677092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002438523108139634,
+      "learning_rate": 1e-05,
+      "loss": 0.044,
+      "num_tokens": 188477778.0,
+      "reward": 0.328125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000240802764893,
+      "sampling/importance_sampling_ratio/min": 0.005636279005557299,
+      "sampling/sampling_logp_difference/max": 5.178531169891357,
+      "sampling/sampling_logp_difference/mean": 0.018984414637088776,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 2.0840709566982696e-05,
+      "clip_ratio/high_mean": 6.135253556749376e-06,
+      "clip_ratio/low_mean": 2.255633432923787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.869158777230041e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15991.0,
+      "completions/mean_length": 7600.9765625,
+      "completions/mean_terminated_length": 6936.71484375,
+      "completions/min_length": 995.0,
+      "completions/min_terminated_length": 995.0,
+      "entropy": 0.8689917623996735,
+      "epoch": 0.20515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004773247055709362,
+      "learning_rate": 1e-05,
+      "loss": 0.0486,
+      "num_tokens": 189470655.0,
+      "reward": 0.40625,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 0.001327168894931674,
+      "sampling/sampling_logp_difference/max": 6.624707221984863,
+      "sampling/sampling_logp_difference/mean": 0.018666012212634087,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 9.837458947004052e-06,
+      "clip_ratio/high_mean": 2.459364736751013e-06,
+      "clip_ratio/low_mean": 6.463955219260242e-05,
+      "clip_ratio/low_min": 1.0895145351241808e-05,
+      "clip_ratio/region_mean": 6.70989177251613e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16215.0,
+      "completions/mean_length": 7600.34375,
+      "completions/mean_terminated_length": 6855.96630859375,
+      "completions/min_length": 1335.0,
+      "completions/min_terminated_length": 1335.0,
+      "entropy": 0.7636929750442505,
+      "epoch": 0.20607175712971482,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004298723768442869,
+      "learning_rate": 1e-05,
+      "loss": 0.145,
+      "num_tokens": 190462227.0,
+      "reward": 0.515625,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999310374259949,
+      "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05,
+      "sampling/sampling_logp_difference/max": 9.996363639831543,
+      "sampling/sampling_logp_difference/mean": 0.018035393208265305,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 1.4060602325116633e-05,
+      "clip_ratio/high_mean": 3.5151505812791584e-06,
+      "clip_ratio/low_mean": 2.6516039497437305e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.003119024924672e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15151.0,
+      "completions/mean_length": 6512.0,
+      "completions/mean_terminated_length": 6434.267578125,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.9043584689497948,
+      "epoch": 0.20699172033118676,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006741553544998169,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 191312483.0,
+      "reward": 0.484375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 1.778468504198827e-05,
+      "sampling/sampling_logp_difference/max": 10.937172889709473,
+      "sampling/sampling_logp_difference/mean": 0.020878732204437256,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 1.7356085209030425e-05,
+      "clip_ratio/high_mean": 4.339021302257606e-06,
+      "clip_ratio/low_mean": 2.8831826739406097e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.317084781429003e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16339.0,
+      "completions/mean_length": 7178.6875,
+      "completions/mean_terminated_length": 6565.00048828125,
+      "completions/min_length": 847.0,
+      "completions/min_terminated_length": 847.0,
+      "entropy": 0.8899475410580635,
+      "epoch": 0.20791168353265868,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00281486171297729,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 192251235.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2240736484527588,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999714493751526,
+      "sampling/importance_sampling_ratio/min": 9.012543159769848e-05,
+      "sampling/sampling_logp_difference/max": 9.314308166503906,
+      "sampling/sampling_logp_difference/mean": 0.020196784287691116,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 1.5558084214717383e-05,
+      "clip_ratio/high_mean": 3.889521053679346e-06,
+      "clip_ratio/low_mean": 3.0248688972278615e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.413820991227112e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15501.0,
+      "completions/max_terminated_length": 15501.0,
+      "completions/mean_length": 6602.5625,
+      "completions/mean_terminated_length": 6602.5625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.9266818463802338,
+      "epoch": 0.20883164673413063,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005070593673735857,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "num_tokens": 193116763.0,
+      "reward": 0.53125,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999746680259705,
+      "sampling/importance_sampling_ratio/min": 2.726537559283315e-06,
+      "sampling/sampling_logp_difference/max": 12.812478065490723,
+      "sampling/sampling_logp_difference/mean": 0.020026464015245438,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 4.188727416476468e-06,
+      "clip_ratio/high_mean": 1.047181854119117e-06,
+      "clip_ratio/low_mean": 2.959152834591805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.063871008635033e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6818.8828125,
+      "completions/mean_terminated_length": 6430.056640625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.874519519507885,
+      "epoch": 0.20975160993560257,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006362155079841614,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "num_tokens": 194007868.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000009298324585,
+      "sampling/importance_sampling_ratio/min": 0.0005216691642999649,
+      "sampling/sampling_logp_difference/max": 7.55847692489624,
+      "sampling/sampling_logp_difference/mean": 0.01943325623869896,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 9.645911177358357e-06,
+      "clip_ratio/high_mean": 2.4114777943395893e-06,
+      "clip_ratio/low_mean": 6.821557258263056e-05,
+      "clip_ratio/low_min": 1.7265090718865395e-05,
+      "clip_ratio/region_mean": 7.062705049065698e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14536.0,
+      "completions/mean_length": 5515.625,
+      "completions/mean_terminated_length": 5343.111328125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0683523043990135,
+      "epoch": 0.21067157313707452,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003797185141593218,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "num_tokens": 194735980.0,
+      "reward": 0.421875,
+      "reward_std": 0.34010058641433716,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911367893219,
+      "sampling/importance_sampling_ratio/min": 1.137102216830499e-07,
+      "sampling/sampling_logp_difference/max": 15.989612579345703,
+      "sampling/sampling_logp_difference/mean": 0.02120930328965187,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 2.1971412252241862e-05,
+      "clip_ratio/high_mean": 5.4928530630604655e-06,
+      "clip_ratio/low_mean": 4.9151800567415194e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4644653801005916e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14992.0,
+      "completions/mean_length": 5853.546875,
+      "completions/mean_terminated_length": 5770.6298828125,
+      "completions/min_length": 615.0,
+      "completions/min_terminated_length": 615.0,
+      "entropy": 0.7975900694727898,
+      "epoch": 0.21159153633854647,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004124365746974945,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 195504882.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000672340393066,
+      "sampling/importance_sampling_ratio/min": 0.0032877910416573286,
+      "sampling/sampling_logp_difference/max": 5.717539310455322,
+      "sampling/sampling_logp_difference/mean": 0.017819223925471306,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 7.066538728395244e-06,
+      "clip_ratio/high_mean": 2.843255515472265e-06,
+      "clip_ratio/low_mean": 5.1467116236381116e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.431037175185338e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15503.0,
+      "completions/mean_length": 6686.25,
+      "completions/mean_terminated_length": 6532.31787109375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.9018580466508865,
+      "epoch": 0.2125114995400184,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0024995009880512953,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 196379306.0,
+      "reward": 0.421875,
+      "reward_std": 0.35824593901634216,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999300837516785,
+      "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05,
+      "sampling/sampling_logp_difference/max": 10.818918228149414,
+      "sampling/sampling_logp_difference/mean": 0.018989525735378265,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 6.652828687947476e-06,
+      "clip_ratio/high_mean": 2.5722979444253724e-06,
+      "clip_ratio/low_mean": 3.699686294567073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.95691608900961e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16347.0,
+      "completions/mean_length": 7487.3359375,
+      "completions/mean_terminated_length": 7200.3466796875,
+      "completions/min_length": 1222.0,
+      "completions/min_terminated_length": 1222.0,
+      "entropy": 0.9890001565217972,
+      "epoch": 0.21343146274149033,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004295211285352707,
+      "learning_rate": 1e-05,
+      "loss": 0.0754,
+      "num_tokens": 197357397.0,
+      "reward": 0.40625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0006548459641635418,
+      "sampling/sampling_logp_difference/max": 7.33111047744751,
+      "sampling/sampling_logp_difference/mean": 0.02209121733903885,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 6.0850939007650595e-06,
+      "clip_ratio/high_mean": 1.5212734751912649e-06,
+      "clip_ratio/low_mean": 2.9443070673096372e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0964344205131056e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15825.0,
+      "completions/mean_length": 7233.484375,
+      "completions/mean_terminated_length": 6938.30615234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.9683803990483284,
+      "epoch": 0.21435142594296228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003119673579931259,
+      "learning_rate": 1e-05,
+      "loss": 0.0914,
+      "num_tokens": 198303795.0,
+      "reward": 0.328125,
+      "reward_std": 0.23014704883098602,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000243186950684,
+      "sampling/importance_sampling_ratio/min": 0.020358745008707047,
+      "sampling/sampling_logp_difference/max": 3.89424467086792,
+      "sampling/sampling_logp_difference/mean": 0.021085180342197418,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 7.963812095113099e-06,
+      "clip_ratio/high_mean": 1.9909530237782747e-06,
+      "clip_ratio/low_mean": 4.031422963635123e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.23051826601295e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15733.0,
+      "completions/mean_length": 6457.78125,
+      "completions/mean_terminated_length": 6300.22265625,
+      "completions/min_length": 850.0,
+      "completions/min_terminated_length": 850.0,
+      "entropy": 0.8881053999066353,
+      "epoch": 0.21527138914443422,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033790848683565855,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 199154735.0,
+      "reward": 0.3828125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998799562454224,
+      "sampling/importance_sampling_ratio/min": 2.872048128210736e-07,
+      "sampling/sampling_logp_difference/max": 15.063070297241211,
+      "sampling/sampling_logp_difference/mean": 0.01950821653008461,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 9.059622016138746e-06,
+      "clip_ratio/high_mean": 3.3430123380639998e-06,
+      "clip_ratio/low_mean": 2.2856192117615137e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6199204512522556e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 7904.40625,
+      "completions/mean_terminated_length": 7769.81005859375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9881557524204254,
+      "epoch": 0.21619135234590617,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0021492803934961557,
+      "learning_rate": 1e-05,
+      "loss": 0.0179,
+      "num_tokens": 200185643.0,
+      "reward": 0.359375,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001094341278076,
+      "sampling/importance_sampling_ratio/min": 0.001458622980862856,
+      "sampling/sampling_logp_difference/max": 6.530262470245361,
+      "sampling/sampling_logp_difference/mean": 0.021201875060796738,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 6.9962839006620925e-06,
+      "clip_ratio/high_mean": 1.7490709751655231e-06,
+      "clip_ratio/low_mean": 3.018811844412994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.193718976035598e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15328.0,
+      "completions/max_terminated_length": 15328.0,
+      "completions/mean_length": 7414.4921875,
+      "completions/mean_terminated_length": 7414.4921875,
+      "completions/min_length": 949.0,
+      "completions/min_terminated_length": 949.0,
+      "entropy": 0.9571134969592094,
+      "epoch": 0.21711131554737811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0037221095990389585,
+      "learning_rate": 1e-05,
+      "loss": 0.0327,
+      "num_tokens": 201153114.0,
+      "reward": 0.4375,
+      "reward_std": 0.248829185962677,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999958872795105,
+      "sampling/importance_sampling_ratio/min": 0.0009130563121289015,
+      "sampling/sampling_logp_difference/max": 6.99871301651001,
+      "sampling/sampling_logp_difference/mean": 0.021356744691729546,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 1.1248092050664127e-05,
+      "clip_ratio/high_mean": 2.8120230126660317e-06,
+      "clip_ratio/low_mean": 5.4354991334548686e-05,
+      "clip_ratio/low_min": 6.868132004456129e-06,
+      "clip_ratio/region_mean": 5.716701480196207e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15835.0,
+      "completions/max_terminated_length": 15835.0,
+      "completions/mean_length": 5955.953125,
+      "completions/mean_terminated_length": 5955.953125,
+      "completions/min_length": 1394.0,
+      "completions/min_terminated_length": 1394.0,
+      "entropy": 0.730999618768692,
+      "epoch": 0.21803127874885003,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006285305600613356,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "num_tokens": 201933044.0,
+      "reward": 0.59375,
+      "reward_std": 0.31011277437210083,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999420642852783,
+      "sampling/importance_sampling_ratio/min": 0.007535050623118877,
+      "sampling/sampling_logp_difference/max": 4.888189792633057,
+      "sampling/sampling_logp_difference/mean": 0.016975615173578262,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 7.226686648209579e-06,
+      "clip_ratio/high_mean": 3.094216481258627e-06,
+      "clip_ratio/low_mean": 4.66828214484849e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.977703792974353e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15959.0,
+      "completions/mean_length": 6923.3515625,
+      "completions/mean_terminated_length": 6458.0732421875,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "entropy": 0.9938417226076126,
+      "epoch": 0.21895124195032198,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005667983554303646,
+      "learning_rate": 1e-05,
+      "loss": 0.0793,
+      "num_tokens": 202837281.0,
+      "reward": 0.2578125,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999980092048645,
+      "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05,
+      "sampling/sampling_logp_difference/max": 10.402952194213867,
+      "sampling/sampling_logp_difference/mean": 0.022059854120016098,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 5.2318769121484365e-06,
+      "clip_ratio/high_mean": 1.3079692280371091e-06,
+      "clip_ratio/low_mean": 4.239228087499214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3700250216716086e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14726.0,
+      "completions/max_terminated_length": 14726.0,
+      "completions/mean_length": 5930.9296875,
+      "completions/mean_terminated_length": 5930.9296875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.8100385963916779,
+      "epoch": 0.21987120515179392,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004052883945405483,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 203614448.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999989926815033,
+      "sampling/importance_sampling_ratio/min": 0.00015170808183029294,
+      "sampling/sampling_logp_difference/max": 8.79355239868164,
+      "sampling/sampling_logp_difference/mean": 0.018519222736358643,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 4.905230980511988e-06,
+      "clip_ratio/high_mean": 1.226307745127997e-06,
+      "clip_ratio/low_mean": 5.500513248080097e-05,
+      "clip_ratio/low_min": 7.924934834591113e-06,
+      "clip_ratio/region_mean": 5.6231440112242126e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14996.0,
+      "completions/mean_length": 6911.1015625,
+      "completions/mean_terminated_length": 6108.3134765625,
+      "completions/min_length": 862.0,
+      "completions/min_terminated_length": 862.0,
+      "entropy": 0.9260227829217911,
+      "epoch": 0.22079116835326587,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004494607914239168,
+      "learning_rate": 1e-05,
+      "loss": 0.0269,
+      "num_tokens": 204518261.0,
+      "reward": 0.4140625,
+      "reward_std": 0.34033796191215515,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998886585235596,
+      "sampling/importance_sampling_ratio/min": 0.0015266009140759706,
+      "sampling/sampling_logp_difference/max": 6.484711647033691,
+      "sampling/sampling_logp_difference/mean": 0.020527629181742668,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 8.293764039990492e-06,
+      "clip_ratio/high_mean": 2.073441009997623e-06,
+      "clip_ratio/low_mean": 4.75325257411896e-05,
+      "clip_ratio/low_min": 3.599504680096288e-06,
+      "clip_ratio/region_mean": 4.960596663750039e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14637.0,
+      "completions/mean_length": 6972.921875,
+      "completions/mean_terminated_length": 6823.5400390625,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 1.0095533654093742,
+      "epoch": 0.22171113155473782,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029451537411659956,
+      "learning_rate": 1e-05,
+      "loss": 0.0108,
+      "num_tokens": 205433843.0,
+      "reward": 0.3515625,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
+      "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05,
+      "sampling/sampling_logp_difference/max": 10.53177547454834,
+      "sampling/sampling_logp_difference/mean": 0.02013089321553707,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 4.163383164268453e-05,
+      "clip_ratio/high_mean": 1.382379150527413e-05,
+      "clip_ratio/low_mean": 3.86000854177837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2423876240936806e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 6706.6640625,
+      "completions/mean_terminated_length": 6313.2763671875,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 0.8647518903017044,
+      "epoch": 0.22263109475620976,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003371767932549119,
+      "learning_rate": 1e-05,
+      "loss": 0.073,
+      "num_tokens": 206310296.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999367594718933,
+      "sampling/importance_sampling_ratio/min": 2.948181463580113e-05,
+      "sampling/sampling_logp_difference/max": 10.431736946105957,
+      "sampling/sampling_logp_difference/mean": 0.019770190119743347,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4946740381892596e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4946740381892596e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16136.0,
+      "completions/mean_length": 6882.609375,
+      "completions/mean_terminated_length": 6415.32763671875,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "entropy": 1.013342760503292,
+      "epoch": 0.22355105795768168,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0016336971893906593,
+      "learning_rate": 1e-05,
+      "loss": 0.0281,
+      "num_tokens": 207210974.0,
+      "reward": 0.359375,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999210834503174,
+      "sampling/importance_sampling_ratio/min": 0.0013267879839986563,
+      "sampling/sampling_logp_difference/max": 6.624994277954102,
+      "sampling/sampling_logp_difference/mean": 0.02139991894364357,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 1.4866403944324702e-05,
+      "clip_ratio/high_mean": 3.7166009860811755e-06,
+      "clip_ratio/low_mean": 3.938925010515959e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.310585177336179e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15203.0,
+      "completions/max_terminated_length": 15203.0,
+      "completions/mean_length": 6195.7421875,
+      "completions/mean_terminated_length": 6195.7421875,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.8448907434940338,
+      "epoch": 0.22447102115915363,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005036406684666872,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "num_tokens": 208021893.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3453505039215088,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999955892562866,
+      "sampling/importance_sampling_ratio/min": 0.0040348549373447895,
+      "sampling/sampling_logp_difference/max": 5.512784957885742,
+      "sampling/sampling_logp_difference/mean": 0.018679853528738022,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 1.1244883353356272e-05,
+      "clip_ratio/high_mean": 2.811220838339068e-06,
+      "clip_ratio/low_mean": 3.422392001084518e-05,
+      "clip_ratio/low_min": 6.451612989621935e-06,
+      "clip_ratio/region_mean": 3.703514119024476e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16305.0,
+      "completions/mean_length": 6829.609375,
+      "completions/mean_terminated_length": 6521.40283203125,
+      "completions/min_length": 735.0,
+      "completions/min_terminated_length": 735.0,
+      "entropy": 0.8679579794406891,
+      "epoch": 0.22539098436062557,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029643685556948185,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 208912059.0,
+      "reward": 0.46875,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999761581420898,
+      "sampling/importance_sampling_ratio/min": 0.00038063788088038564,
+      "sampling/sampling_logp_difference/max": 7.873661994934082,
+      "sampling/sampling_logp_difference/mean": 0.018488366156816483,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 2.2700600311509334e-05,
+      "clip_ratio/high_mean": 5.675150077877333e-06,
+      "clip_ratio/low_mean": 3.138338854569156e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.705853873725573e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14503.0,
+      "completions/max_terminated_length": 14503.0,
+      "completions/mean_length": 5444.4453125,
+      "completions/mean_terminated_length": 5444.4453125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 1.0460086688399315,
+      "epoch": 0.22631094756209752,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035942886024713516,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "num_tokens": 209627804.0,
+      "reward": 0.484375,
+      "reward_std": 0.338498055934906,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99997478723526,
+      "sampling/importance_sampling_ratio/min": 0.03179635480046272,
+      "sampling/sampling_logp_difference/max": 3.4484035968780518,
+      "sampling/sampling_logp_difference/mean": 0.020146891474723816,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 1.477029400120955e-05,
+      "clip_ratio/high_mean": 4.552578502625693e-06,
+      "clip_ratio/low_mean": 5.265122354103369e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.720380158891203e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16244.0,
+      "completions/mean_length": 7657.390625,
+      "completions/mean_terminated_length": 7152.544921875,
+      "completions/min_length": 1048.0,
+      "completions/min_terminated_length": 1048.0,
+      "entropy": 0.9528728649020195,
+      "epoch": 0.22723091076356947,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0044983453117311,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 210630150.0,
+      "reward": 0.4375,
+      "reward_std": 0.26249876618385315,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000007152557373,
+      "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05,
+      "sampling/sampling_logp_difference/max": 10.158285140991211,
+      "sampling/sampling_logp_difference/mean": 0.02131088823080063,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 8.607642712377128e-06,
+      "clip_ratio/high_mean": 2.151910678094282e-06,
+      "clip_ratio/low_mean": 2.2759413695894182e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.491132454451872e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16284.0,
+      "completions/mean_length": 7574.3515625,
+      "completions/mean_terminated_length": 7504.984375,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 1.0009776800870895,
+      "epoch": 0.2281508739650414,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006095650140196085,
+      "learning_rate": 1e-05,
+      "loss": 0.0566,
+      "num_tokens": 211620355.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000433921813965,
+      "sampling/importance_sampling_ratio/min": 0.0013946897815912962,
+      "sampling/sampling_logp_difference/max": 6.575083255767822,
+      "sampling/sampling_logp_difference/mean": 0.021727774292230606,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 1.764823082339717e-05,
+      "clip_ratio/high_mean": 5.141430960975413e-06,
+      "clip_ratio/low_mean": 5.936152001595474e-05,
+      "clip_ratio/low_min": 9.155588486464694e-06,
+      "clip_ratio/region_mean": 6.450295177273802e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14915.0,
+      "completions/mean_length": 7919.6875,
+      "completions/mean_terminated_length": 7716.54443359375,
+      "completions/min_length": 1517.0,
+      "completions/min_terminated_length": 1517.0,
+      "entropy": 1.0405654236674309,
+      "epoch": 0.22907083716651333,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037038614973425865,
+      "learning_rate": 1e-05,
+      "loss": 0.0364,
+      "num_tokens": 212654747.0,
+      "reward": 0.3125,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999381899833679,
+      "sampling/importance_sampling_ratio/min": 0.0057550109922885895,
+      "sampling/sampling_logp_difference/max": 5.157684326171875,
+      "sampling/sampling_logp_difference/mean": 0.022051017731428146,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 1.265254240934155e-05,
+      "clip_ratio/high_mean": 3.1631356023353874e-06,
+      "clip_ratio/low_mean": 4.716233138424286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.032546687289141e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 8613.4765625,
+      "completions/mean_terminated_length": 7735.0693359375,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.890489287674427,
+      "epoch": 0.22999080036798528,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00325607368722558,
+      "learning_rate": 1e-05,
+      "loss": 0.0571,
+      "num_tokens": 213774584.0,
+      "reward": 0.40625,
+      "reward_std": 0.33668074011802673,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000060796737671,
+      "sampling/importance_sampling_ratio/min": 1.670176425250247e-05,
+      "sampling/sampling_logp_difference/max": 10.999996185302734,
+      "sampling/sampling_logp_difference/mean": 0.020002499222755432,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 1.6404605503339553e-05,
+      "clip_ratio/high_mean": 4.101151375834888e-06,
+      "clip_ratio/low_mean": 3.880500707964529e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2906158682853857e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16022.0,
+      "completions/mean_length": 7324.8984375,
+      "completions/mean_terminated_length": 6473.1884765625,
+      "completions/min_length": 704.0,
+      "completions/min_terminated_length": 704.0,
+      "entropy": 0.761004202067852,
+      "epoch": 0.23091076356945722,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038265211042016745,
+      "learning_rate": 1e-05,
+      "loss": 0.0717,
+      "num_tokens": 214728371.0,
+      "reward": 0.515625,
+      "reward_std": 0.32719239592552185,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000168085098267,
+      "sampling/importance_sampling_ratio/min": 0.0003049026126973331,
+      "sampling/sampling_logp_difference/max": 8.095518112182617,
+      "sampling/sampling_logp_difference/mean": 0.018367979675531387,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 5.624549885396846e-06,
+      "clip_ratio/high_mean": 1.4061374713492114e-06,
+      "clip_ratio/low_mean": 3.6433707123251224e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7839844594600436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14167.0,
+      "completions/max_terminated_length": 14167.0,
+      "completions/mean_length": 6422.0859375,
+      "completions/mean_terminated_length": 6422.0859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.9946094751358032,
+      "epoch": 0.23183072677092917,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002729539293795824,
+      "learning_rate": 1e-05,
+      "loss": 0.0158,
+      "num_tokens": 215570806.0,
+      "reward": 0.3515625,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999935030937195,
+      "sampling/importance_sampling_ratio/min": 0.026308411732316017,
+      "sampling/sampling_logp_difference/max": 3.637866497039795,
+      "sampling/sampling_logp_difference/mean": 0.021903935819864273,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 7.2379848461423535e-06,
+      "clip_ratio/high_mean": 1.8094962115355884e-06,
+      "clip_ratio/low_mean": 3.17277934982485e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353728982347093e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15585.0,
+      "completions/mean_length": 6845.2890625,
+      "completions/mean_terminated_length": 6693.88134765625,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8822609707713127,
+      "epoch": 0.23275068997240111,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004974282346665859,
+      "learning_rate": 1e-05,
+      "loss": 0.1011,
+      "num_tokens": 216465635.0,
+      "reward": 0.5390625,
+      "reward_std": 0.30061954259872437,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
+      "sampling/importance_sampling_ratio/min": 8.749838889343664e-05,
+      "sampling/sampling_logp_difference/max": 9.343890190124512,
+      "sampling/sampling_logp_difference/mean": 0.019389234483242035,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 1.58592818024772e-05,
+      "clip_ratio/high_mean": 3.9648204506193e-06,
+      "clip_ratio/low_mean": 4.096964960353944e-05,
+      "clip_ratio/low_min": 1.7403560605089297e-05,
+      "clip_ratio/region_mean": 4.49344687467601e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 7805.484375,
+      "completions/mean_terminated_length": 7528.7578125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.9977599084377289,
+      "epoch": 0.23367065317387303,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0033159854356199503,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 217485089.0,
+      "reward": 0.421875,
+      "reward_std": 0.33114904165267944,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999412298202515,
+      "sampling/importance_sampling_ratio/min": 7.967943383846432e-05,
+      "sampling/sampling_logp_difference/max": 9.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.021925684064626694,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 1.8265397557115648e-05,
+      "clip_ratio/high_mean": 4.566349389278912e-06,
+      "clip_ratio/low_mean": 4.044636898470344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5012717691861326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15681.0,
+      "completions/mean_length": 7737.5546875,
+      "completions/mean_terminated_length": 7530.04052734375,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.8667014688253403,
+      "epoch": 0.23459061637534498,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034952745772898197,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "num_tokens": 218496040.0,
+      "reward": 0.453125,
+      "reward_std": 0.3085102438926697,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999128580093384,
+      "sampling/importance_sampling_ratio/min": 6.726370338583365e-05,
+      "sampling/sampling_logp_difference/max": 9.606889724731445,
+      "sampling/sampling_logp_difference/mean": 0.019742710515856743,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 8.244294804171659e-06,
+      "clip_ratio/high_mean": 2.0610737010429148e-06,
+      "clip_ratio/low_mean": 3.204250072030845e-05,
+      "clip_ratio/low_min": 3.323495775475749e-06,
+      "clip_ratio/region_mean": 3.410357436450795e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15858.0,
+      "completions/mean_length": 7365.84375,
+      "completions/mean_terminated_length": 6601.59326171875,
+      "completions/min_length": 744.0,
+      "completions/min_terminated_length": 744.0,
+      "entropy": 0.8151945173740387,
+      "epoch": 0.23551057957681693,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038676802068948746,
+      "learning_rate": 1e-05,
+      "loss": 0.0667,
+      "num_tokens": 219459140.0,
+      "reward": 0.46875,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00023387260443996638,
+      "sampling/sampling_logp_difference/max": 8.360733985900879,
+      "sampling/sampling_logp_difference/mean": 0.018882082775235176,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 6.87833608026267e-06,
+      "clip_ratio/high_mean": 2.9462287329806713e-06,
+      "clip_ratio/low_mean": 5.435333650893881e-05,
+      "clip_ratio/low_min": 5.33937054569833e-06,
+      "clip_ratio/region_mean": 5.729956546929316e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14556.0,
+      "completions/mean_length": 6448.0078125,
+      "completions/mean_terminated_length": 6369.771484375,
+      "completions/min_length": 1128.0,
+      "completions/min_terminated_length": 1128.0,
+      "entropy": 0.9546648040413857,
+      "epoch": 0.23643054277828887,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004310046322643757,
+      "learning_rate": 1e-05,
+      "loss": 0.1082,
+      "num_tokens": 220304605.0,
+      "reward": 0.5703125,
+      "reward_std": 0.35611939430236816,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999396800994873,
+      "sampling/importance_sampling_ratio/min": 0.0001234127557836473,
+      "sampling/sampling_logp_difference/max": 8.99997615814209,
+      "sampling/sampling_logp_difference/mean": 0.020253397524356842,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 6.196094091137638e-06,
+      "clip_ratio/high_mean": 1.5490235227844096e-06,
+      "clip_ratio/low_mean": 2.5416685957679874e-05,
+      "clip_ratio/low_min": 5.5736391004757024e-06,
+      "clip_ratio/region_mean": 2.696570959415112e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16037.0,
+      "completions/mean_length": 7457.6484375,
+      "completions/mean_terminated_length": 6941.24755859375,
+      "completions/min_length": 604.0,
+      "completions/min_terminated_length": 604.0,
+      "entropy": 0.8182889074087143,
+      "epoch": 0.23735050597976082,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026646999176591635,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 221281968.0,
+      "reward": 0.4453125,
+      "reward_std": 0.2012200653553009,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173283576965,
+      "sampling/importance_sampling_ratio/min": 2.902353571698768e-06,
+      "sampling/sampling_logp_difference/max": 12.749988555908203,
+      "sampling/sampling_logp_difference/mean": 0.019208962097764015,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 1.6189535017474554e-05,
+      "clip_ratio/high_mean": 4.047383754368639e-06,
+      "clip_ratio/low_mean": 3.127787306311802e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.532525670379982e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 8561.109375,
+      "completions/mean_terminated_length": 7969.79052734375,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.9581378549337387,
+      "epoch": 0.23827046918123276,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0016026750672608614,
+      "learning_rate": 1e-05,
+      "loss": 0.0131,
+      "num_tokens": 222399046.0,
+      "reward": 0.34375,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 1.653693971093162e-06,
+      "sampling/sampling_logp_difference/max": 13.312499046325684,
+      "sampling/sampling_logp_difference/mean": 0.02173236384987831,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 1.4200771602190798e-05,
+      "clip_ratio/high_mean": 4.3255887476334465e-06,
+      "clip_ratio/low_mean": 5.2955770115659107e-05,
+      "clip_ratio/low_min": 3.402656830076012e-06,
+      "clip_ratio/region_mean": 5.7281358749605715e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16239.0,
+      "completions/mean_length": 7152.34375,
+      "completions/mean_terminated_length": 7079.6533203125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9052041247487068,
+      "epoch": 0.23919043238270468,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005460259038954973,
+      "learning_rate": 1e-05,
+      "loss": 0.0845,
+      "num_tokens": 223335010.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3356297016143799,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999966621398926,
+      "sampling/importance_sampling_ratio/min": 0.010161337442696095,
+      "sampling/sampling_logp_difference/max": 4.589165210723877,
+      "sampling/sampling_logp_difference/mean": 0.01986619457602501,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 1.4350314813782461e-05,
+      "clip_ratio/high_mean": 3.5875787034456152e-06,
+      "clip_ratio/low_mean": 3.81288905373367e-05,
+      "clip_ratio/low_min": 8.099272235995159e-06,
+      "clip_ratio/region_mean": 4.1716469809216505e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15631.0,
+      "completions/mean_length": 6678.65625,
+      "completions/mean_terminated_length": 6524.603515625,
+      "completions/min_length": 963.0,
+      "completions/min_terminated_length": 963.0,
+      "entropy": 0.9043187350034714,
+      "epoch": 0.24011039558417663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005933742038905621,
+      "learning_rate": 1e-05,
+      "loss": 0.0966,
+      "num_tokens": 224207006.0,
+      "reward": 0.484375,
+      "reward_std": 0.3316681981086731,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000031590461731,
+      "sampling/importance_sampling_ratio/min": 0.0011734943836927414,
+      "sampling/sampling_logp_difference/max": 6.747769355773926,
+      "sampling/sampling_logp_difference/mean": 0.019827336072921753,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 1.6498819377375185e-05,
+      "clip_ratio/high_mean": 4.124704844343796e-06,
+      "clip_ratio/low_mean": 3.601791678420341e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.014262168539062e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15972.0,
+      "completions/mean_length": 6999.0390625,
+      "completions/mean_terminated_length": 6850.07177734375,
+      "completions/min_length": 990.0,
+      "completions/min_terminated_length": 990.0,
+      "entropy": 0.8109970837831497,
+      "epoch": 0.24103035878564857,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003635740838944912,
+      "learning_rate": 1e-05,
+      "loss": 0.104,
+      "num_tokens": 225122891.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999303817749023,
+      "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05,
+      "sampling/sampling_logp_difference/max": 10.987512588500977,
+      "sampling/sampling_logp_difference/mean": 0.018912551924586296,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 9.527577958579059e-06,
+      "clip_ratio/high_mean": 2.3818944896447647e-06,
+      "clip_ratio/low_mean": 3.766565987461945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.004755419373396e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15713.0,
+      "completions/mean_length": 7483.7109375,
+      "completions/mean_terminated_length": 7045.9912109375,
+      "completions/min_length": 1153.0,
+      "completions/min_terminated_length": 1153.0,
+      "entropy": 0.9473970532417297,
+      "epoch": 0.24195032198712052,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003405241761356592,
+      "learning_rate": 1e-05,
+      "loss": 0.0458,
+      "num_tokens": 226102462.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00002920627594,
+      "sampling/importance_sampling_ratio/min": 0.00525119062513113,
+      "sampling/sampling_logp_difference/max": 5.249300479888916,
+      "sampling/sampling_logp_difference/mean": 0.021076779812574387,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 1.5867321963014547e-05,
+      "clip_ratio/high_mean": 3.966830490753637e-06,
+      "clip_ratio/low_mean": 3.8259706570897833e-05,
+      "clip_ratio/low_min": 3.549019083948224e-06,
+      "clip_ratio/region_mean": 4.2226537743772496e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 7569.03125,
+      "completions/mean_terminated_length": 7357.47216796875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9231455475091934,
+      "epoch": 0.24287028518859247,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0025927501264959574,
+      "learning_rate": 1e-05,
+      "loss": 0.0801,
+      "num_tokens": 227093562.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19097033143043518,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0052477638237178326,
+      "sampling/sampling_logp_difference/max": 5.249953269958496,
+      "sampling/sampling_logp_difference/mean": 0.020578444004058838,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 1.344091060673236e-05,
+      "clip_ratio/high_mean": 3.36022765168309e-06,
+      "clip_ratio/low_mean": 4.253613235505327e-05,
+      "clip_ratio/low_min": 3.5579084851633525e-06,
+      "clip_ratio/region_mean": 4.5896360120423196e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 7589.2734375,
+      "completions/mean_terminated_length": 7378.2001953125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "entropy": 0.9265239909291267,
+      "epoch": 0.24379024839006438,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030512227676808834,
+      "learning_rate": 1e-05,
+      "loss": 0.04,
+      "num_tokens": 228086405.0,
+      "reward": 0.4296875,
+      "reward_std": 0.27905434370040894,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000004529953003,
+      "sampling/importance_sampling_ratio/min": 0.0002165911573683843,
+      "sampling/sampling_logp_difference/max": 8.437499046325684,
+      "sampling/sampling_logp_difference/mean": 0.020208362489938736,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 1.9613525410022703e-05,
+      "clip_ratio/high_mean": 4.903381352505676e-06,
+      "clip_ratio/low_mean": 3.184792547017423e-05,
+      "clip_ratio/low_min": 7.29296516510658e-06,
+      "clip_ratio/region_mean": 3.675130722058384e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16275.0,
+      "completions/mean_length": 8420.6875,
+      "completions/mean_terminated_length": 8096.97509765625,
+      "completions/min_length": 1114.0,
+      "completions/min_terminated_length": 1114.0,
+      "entropy": 0.9572964608669281,
+      "epoch": 0.24471021159153633,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022430522367358208,
+      "learning_rate": 1e-05,
+      "loss": 0.0444,
+      "num_tokens": 229183765.0,
+      "reward": 0.34375,
+      "reward_std": 0.309583842754364,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 0.00029693738906644285,
+      "sampling/sampling_logp_difference/max": 8.121989250183105,
+      "sampling/sampling_logp_difference/mean": 0.021570362150669098,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 6.728750577167375e-06,
+      "clip_ratio/high_mean": 1.6821876442918438e-06,
+      "clip_ratio/low_mean": 2.1682553096979973e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.336474062758498e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15736.0,
+      "completions/mean_length": 6809.765625,
+      "completions/mean_terminated_length": 6579.984375,
+      "completions/min_length": 860.0,
+      "completions/min_terminated_length": 860.0,
+      "entropy": 0.884086549282074,
+      "epoch": 0.24563017479300828,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004295065999031067,
+      "learning_rate": 1e-05,
+      "loss": 0.1058,
+      "num_tokens": 230077607.0,
+      "reward": 0.484375,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294281005859,
+      "sampling/importance_sampling_ratio/min": 0.00754612497985363,
+      "sampling/sampling_logp_difference/max": 4.886721134185791,
+      "sampling/sampling_logp_difference/mean": 0.019895706325769424,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 2.8609347509700456e-05,
+      "clip_ratio/high_mean": 7.152336877425114e-06,
+      "clip_ratio/low_mean": 5.158006410965754e-05,
+      "clip_ratio/low_min": 5.210069957684027e-06,
+      "clip_ratio/region_mean": 5.873240070286556e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15080.0,
+      "completions/mean_length": 7340.6953125,
+      "completions/mean_terminated_length": 6973.0810546875,
+      "completions/min_length": 1616.0,
+      "completions/min_terminated_length": 1616.0,
+      "entropy": 0.9920620769262314,
+      "epoch": 0.24655013799448022,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004631794057786465,
+      "learning_rate": 1e-05,
+      "loss": 0.0096,
+      "num_tokens": 231035616.0,
+      "reward": 0.4375,
+      "reward_std": 0.3235401213169098,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999337792396545,
+      "sampling/importance_sampling_ratio/min": 0.0002508950710762292,
+      "sampling/sampling_logp_difference/max": 8.290475845336914,
+      "sampling/sampling_logp_difference/mean": 0.020591016858816147,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.3085940774290066e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3085940774290066e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14120.0,
+      "completions/mean_length": 6748.875,
+      "completions/mean_terminated_length": 6595.93701171875,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.9867061004042625,
+      "epoch": 0.24747010119595217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035752104595303535,
+      "learning_rate": 1e-05,
+      "loss": 0.0455,
+      "num_tokens": 231920056.0,
+      "reward": 0.40625,
+      "reward_std": 0.21884137392044067,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999653100967407,
+      "sampling/importance_sampling_ratio/min": 0.0003869794018100947,
+      "sampling/sampling_logp_difference/max": 7.8571391105651855,
+      "sampling/sampling_logp_difference/mean": 0.02061416581273079,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 1.2506750408647349e-05,
+      "clip_ratio/high_mean": 3.1266876021618373e-06,
+      "clip_ratio/low_mean": 3.10397430212106e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.416643085074611e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 7260.3046875,
+      "completions/mean_terminated_length": 7188.46435546875,
+      "completions/min_length": 1384.0,
+      "completions/min_terminated_length": 1384.0,
+      "entropy": 1.0388494208455086,
+      "epoch": 0.24839006439742412,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036644963547587395,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 232869159.0,
+      "reward": 0.390625,
+      "reward_std": 0.2359209954738617,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999546408653259,
+      "sampling/importance_sampling_ratio/min": 0.0008660226594656706,
+      "sampling/sampling_logp_difference/max": 7.051599502563477,
+      "sampling/sampling_logp_difference/mean": 0.02120530977845192,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 2.704355301830219e-05,
+      "clip_ratio/high_mean": 6.760888254575548e-06,
+      "clip_ratio/low_mean": 3.1861192269388994e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.862208097871189e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16073.0,
+      "completions/max_terminated_length": 16073.0,
+      "completions/mean_length": 6354.4609375,
+      "completions/mean_terminated_length": 6354.4609375,
+      "completions/min_length": 1035.0,
+      "completions/min_terminated_length": 1035.0,
+      "entropy": 0.8405331820249557,
+      "epoch": 0.24931002759889603,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004709267523139715,
+      "learning_rate": 1e-05,
+      "loss": 0.0039,
+      "num_tokens": 233702842.0,
+      "reward": 0.546875,
+      "reward_std": 0.3214184641838074,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 0.0046309432946145535,
+      "sampling/sampling_logp_difference/max": 5.37499475479126,
+      "sampling/sampling_logp_difference/mean": 0.019126038998365402,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 9.749228638611385e-06,
+      "clip_ratio/high_mean": 2.437307159652846e-06,
+      "clip_ratio/low_mean": 3.855073941849696e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.098804652130639e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16026.0,
+      "completions/mean_length": 6514.578125,
+      "completions/mean_terminated_length": 6357.9208984375,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "entropy": 1.0254098922014236,
+      "epoch": 0.250229990800368,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003066045930609107,
+      "learning_rate": 1e-05,
+      "loss": 0.0757,
+      "num_tokens": 234556348.0,
+      "reward": 0.4375,
+      "reward_std": 0.31246688961982727,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999805092811584,
+      "sampling/importance_sampling_ratio/min": 0.005210204049944878,
+      "sampling/sampling_logp_difference/max": 5.257136344909668,
+      "sampling/sampling_logp_difference/mean": 0.019960148259997368,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 1.0475813724042382e-05,
+      "clip_ratio/high_mean": 2.6189534310105955e-06,
+      "clip_ratio/low_mean": 3.487835761006863e-05,
+      "clip_ratio/low_min": 2.9392399483185727e-06,
+      "clip_ratio/region_mean": 3.749731081370555e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 7379.5546875,
+      "completions/mean_terminated_length": 7236.62744140625,
+      "completions/min_length": 701.0,
+      "completions/min_terminated_length": 701.0,
+      "entropy": 1.0397320613265038,
+      "epoch": 0.2511499540018399,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005132520105689764,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 235521091.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519364118576,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999256134033203,
+      "sampling/importance_sampling_ratio/min": 0.00016659013635944575,
+      "sampling/sampling_logp_difference/max": 8.699974060058594,
+      "sampling/sampling_logp_difference/mean": 0.021417103707790375,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 1.9904123973901733e-05,
+      "clip_ratio/high_mean": 5.776861314643611e-06,
+      "clip_ratio/low_mean": 2.6659268655748747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2436129686175263e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14565.0,
+      "completions/mean_length": 7837.1640625,
+      "completions/mean_terminated_length": 7632.04052734375,
+      "completions/min_length": 1346.0,
+      "completions/min_terminated_length": 1346.0,
+      "entropy": 0.8400963917374611,
+      "epoch": 0.25206991720331184,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028969801496714354,
+      "learning_rate": 1e-05,
+      "loss": 0.0143,
+      "num_tokens": 236544160.0,
+      "reward": 0.3828125,
+      "reward_std": 0.29378965497016907,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999887943267822,
+      "sampling/importance_sampling_ratio/min": 2.883308241052873e-07,
+      "sampling/sampling_logp_difference/max": 15.059157371520996,
+      "sampling/sampling_logp_difference/mean": 0.019267702475190163,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 8.562770290154731e-06,
+      "clip_ratio/high_mean": 2.1406925725386827e-06,
+      "clip_ratio/low_mean": 4.060094340729847e-05,
+      "clip_ratio/low_min": 3.8700886761944275e-06,
+      "clip_ratio/region_mean": 4.2741635979837156e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15350.0,
+      "completions/mean_length": 6696.3515625,
+      "completions/mean_terminated_length": 6542.57958984375,
+      "completions/min_length": 1239.0,
+      "completions/min_terminated_length": 1239.0,
+      "entropy": 0.8495818004012108,
+      "epoch": 0.2529898804047838,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003412836929783225,
+      "learning_rate": 1e-05,
+      "loss": 0.0803,
+      "num_tokens": 237423101.0,
+      "reward": 0.515625,
+      "reward_std": 0.37981897592544556,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.012152798473834991,
+      "sampling/sampling_logp_difference/max": 4.410195827484131,
+      "sampling/sampling_logp_difference/mean": 0.018458625301718712,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 1.1463653436294408e-05,
+      "clip_ratio/high_mean": 3.646129641765583e-06,
+      "clip_ratio/low_mean": 6.144847083078275e-05,
+      "clip_ratio/low_min": 1.110105540647055e-05,
+      "clip_ratio/region_mean": 6.509460160941671e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15666.0,
+      "completions/mean_length": 7700.3671875,
+      "completions/mean_terminated_length": 7121.45849609375,
+      "completions/min_length": 844.0,
+      "completions/min_terminated_length": 844.0,
+      "entropy": 0.8258870914578438,
+      "epoch": 0.25390984360625574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024443145375698805,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "num_tokens": 238429956.0,
+      "reward": 0.375,
+      "reward_std": 0.2872493863105774,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999113082885742,
+      "sampling/importance_sampling_ratio/min": 0.00026112530031241477,
+      "sampling/sampling_logp_difference/max": 8.250510215759277,
+      "sampling/sampling_logp_difference/mean": 0.019427984952926636,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 4.218127742205979e-06,
+      "clip_ratio/high_mean": 1.0545319355514948e-06,
+      "clip_ratio/low_mean": 1.7289162997258245e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.834369493280974e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16112.0,
+      "completions/mean_length": 6255.21875,
+      "completions/mean_terminated_length": 6094.44482421875,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.8179014846682549,
+      "epoch": 0.2548298068077277,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022747826296836138,
+      "learning_rate": 1e-05,
+      "loss": 0.0222,
+      "num_tokens": 239250160.0,
+      "reward": 0.5234375,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.0002633975527714938,
+      "sampling/sampling_logp_difference/max": 8.241846084594727,
+      "sampling/sampling_logp_difference/mean": 0.018723051995038986,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 1.698448841125355e-05,
+      "clip_ratio/high_mean": 5.369374321162468e-06,
+      "clip_ratio/low_mean": 6.14647315160255e-05,
+      "clip_ratio/low_min": 5.043576493335422e-06,
+      "clip_ratio/region_mean": 6.683410583718796e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15321.0,
+      "completions/max_terminated_length": 15321.0,
+      "completions/mean_length": 6914.9609375,
+      "completions/mean_terminated_length": 6914.9609375,
+      "completions/min_length": 730.0,
+      "completions/min_terminated_length": 730.0,
+      "entropy": 0.9700981751084328,
+      "epoch": 0.25574977000919963,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005685295443981886,
+      "learning_rate": 1e-05,
+      "loss": -0.0056,
+      "num_tokens": 240156211.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998887777328491,
+      "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05,
+      "sampling/sampling_logp_difference/max": 9.997581481933594,
+      "sampling/sampling_logp_difference/mean": 0.021195171400904655,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.9186837764427764e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9186837764427764e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15469.0,
+      "completions/mean_length": 5227.53125,
+      "completions/mean_terminated_length": 5139.68505859375,
+      "completions/min_length": 647.0,
+      "completions/min_terminated_length": 647.0,
+      "entropy": 0.9116031974554062,
+      "epoch": 0.25666973321067155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003880272386595607,
+      "learning_rate": 1e-05,
+      "loss": 0.1246,
+      "num_tokens": 240845295.0,
+      "reward": 0.6328125,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.6328125,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000362396240234,
+      "sampling/importance_sampling_ratio/min": 0.00012422871077433228,
+      "sampling/sampling_logp_difference/max": 8.993386268615723,
+      "sampling/sampling_logp_difference/mean": 0.018801718950271606,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 2.5015486926349695e-05,
+      "clip_ratio/high_mean": 8.084949570275057e-06,
+      "clip_ratio/low_mean": 5.524710468307603e-05,
+      "clip_ratio/low_min": 3.776891389861703e-06,
+      "clip_ratio/region_mean": 6.333205465125502e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 8065.4765625,
+      "completions/mean_terminated_length": 7510.90869140625,
+      "completions/min_length": 1055.0,
+      "completions/min_terminated_length": 1055.0,
+      "entropy": 0.7446574792265892,
+      "epoch": 0.2575896964121435,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0028986844699829817,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 241895676.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3474721610546112,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999842643737793,
+      "sampling/importance_sampling_ratio/min": 0.0017039099475368857,
+      "sampling/sampling_logp_difference/max": 6.3748297691345215,
+      "sampling/sampling_logp_difference/mean": 0.01853121444582939,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 9.486341014053323e-06,
+      "clip_ratio/high_mean": 2.371585253513331e-06,
+      "clip_ratio/low_mean": 2.896106741445692e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.133265261112683e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15534.0,
+      "completions/max_terminated_length": 15534.0,
+      "completions/mean_length": 6127.359375,
+      "completions/mean_terminated_length": 6127.359375,
+      "completions/min_length": 848.0,
+      "completions/min_terminated_length": 848.0,
+      "entropy": 0.8569132760167122,
+      "epoch": 0.25850965961361544,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003845847910270095,
+      "learning_rate": 1e-05,
+      "loss": 0.0368,
+      "num_tokens": 242698258.0,
+      "reward": 0.53125,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000942945480347,
+      "sampling/importance_sampling_ratio/min": 0.00043231461313553154,
+      "sampling/sampling_logp_difference/max": 7.746356964111328,
+      "sampling/sampling_logp_difference/mean": 0.01856958493590355,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 2.9848330086679198e-05,
+      "clip_ratio/high_mean": 7.4620825216697995e-06,
+      "clip_ratio/low_mean": 4.3558867673709756e-05,
+      "clip_ratio/low_min": 4.417741820361698e-06,
+      "clip_ratio/region_mean": 5.1020949285884853e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15192.0,
+      "completions/mean_length": 6600.1484375,
+      "completions/mean_terminated_length": 6365.33642578125,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.78924310952425,
+      "epoch": 0.2594296228150874,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003953634761273861,
+      "learning_rate": 1e-05,
+      "loss": 0.0666,
+      "num_tokens": 243560957.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999537467956543,
+      "sampling/importance_sampling_ratio/min": 0.0006525487406179309,
+      "sampling/sampling_logp_difference/max": 7.334624767303467,
+      "sampling/sampling_logp_difference/mean": 0.018097909167408943,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 6.635561703660642e-06,
+      "clip_ratio/high_mean": 1.6588904259151604e-06,
+      "clip_ratio/low_mean": 2.737523408313791e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9034124281679397e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15755.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7852.171875,
+      "completions/mean_terminated_length": 7852.171875,
+      "completions/min_length": 1276.0,
+      "completions/min_terminated_length": 1276.0,
+      "entropy": 1.0598893761634827,
+      "epoch": 0.26034958601655933,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00360781978815794,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 244585923.0,
+      "reward": 0.3125,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05,
+      "sampling/sampling_logp_difference/max": 10.076086044311523,
+      "sampling/sampling_logp_difference/mean": 0.022330068051815033,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 3.1540168947685743e-06,
+      "clip_ratio/high_mean": 7.885042236921436e-07,
+      "clip_ratio/low_mean": 4.7973388973332476e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.876189268543385e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7972.2265625,
+      "completions/mean_terminated_length": 7700.87890625,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.933217465877533,
+      "epoch": 0.2612695492180313,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0027661293279379606,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 245628064.0,
+      "reward": 0.28125,
+      "reward_std": 0.1872510462999344,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999428987503052,
+      "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05,
+      "sampling/sampling_logp_difference/max": 10.366576194763184,
+      "sampling/sampling_logp_difference/mean": 0.021125148981809616,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 1.2965969062861404e-05,
+      "clip_ratio/high_mean": 3.241492265715351e-06,
+      "clip_ratio/low_mean": 4.6317693090713874e-05,
+      "clip_ratio/low_min": 3.820877282123547e-06,
+      "clip_ratio/region_mean": 4.955918507221213e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15744.0,
+      "completions/mean_length": 7135.6953125,
+      "completions/mean_terminated_length": 6913.736328125,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "entropy": 0.7786942347884178,
+      "epoch": 0.2621895124195032,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005680318456143141,
+      "learning_rate": 1e-05,
+      "loss": 0.0786,
+      "num_tokens": 246561329.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3077537715435028,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999462366104126,
+      "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05,
+      "sampling/sampling_logp_difference/max": 9.737424850463867,
+      "sampling/sampling_logp_difference/mean": 0.018504241481423378,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.22437145175536e-05,
+      "clip_ratio/low_min": 1.4025082009538892e-05,
+      "clip_ratio/region_mean": 4.22437145175536e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16161.0,
+      "completions/mean_length": 6704.046875,
+      "completions/mean_terminated_length": 6627.82666015625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "entropy": 1.0435140281915665,
+      "epoch": 0.26310947562097514,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026402862276881933,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 247437415.0,
+      "reward": 0.3828125,
+      "reward_std": 0.31276631355285645,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998904466629028,
+      "sampling/importance_sampling_ratio/min": 0.0007800163584761322,
+      "sampling/sampling_logp_difference/max": 7.156195640563965,
+      "sampling/sampling_logp_difference/mean": 0.02134273201227188,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 2.223430897174694e-05,
+      "clip_ratio/high_mean": 6.8746438159905665e-06,
+      "clip_ratio/low_mean": 4.7084630978133646e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3959275192028144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15708.0,
+      "completions/mean_length": 5892.5078125,
+      "completions/mean_terminated_length": 5725.9765625,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "entropy": 0.8004944771528244,
+      "epoch": 0.2640294388224471,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003993614576756954,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 248211112.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0024652592837810516,
+      "sampling/sampling_logp_difference/max": 6.005458354949951,
+      "sampling/sampling_logp_difference/mean": 0.01924925297498703,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 2.1833082200828358e-05,
+      "clip_ratio/high_mean": 5.458270550207089e-06,
+      "clip_ratio/low_mean": 3.415995615796419e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.961822596920683e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 7812.140625,
+      "completions/mean_terminated_length": 7316.24755859375,
+      "completions/min_length": 1515.0,
+      "completions/min_terminated_length": 1515.0,
+      "entropy": 0.8841542899608612,
+      "epoch": 0.26494940202391903,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001573400106281042,
+      "learning_rate": 1e-05,
+      "loss": 0.0823,
+      "num_tokens": 249228106.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2767002284526825,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 0.001001527882181108,
+      "sampling/sampling_logp_difference/max": 6.906228542327881,
+      "sampling/sampling_logp_difference/mean": 0.01956877112388611,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 1.014439021673752e-05,
+      "clip_ratio/high_mean": 2.53609755418438e-06,
+      "clip_ratio/low_mean": 3.068193461785995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.321803217204433e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 6372.953125,
+      "completions/mean_terminated_length": 6132.6884765625,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.8228401988744736,
+      "epoch": 0.265869365225391,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0021125099156051874,
+      "learning_rate": 1e-05,
+      "loss": 0.0438,
+      "num_tokens": 250063284.0,
+      "reward": 0.5,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999308586120605,
+      "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05,
+      "sampling/sampling_logp_difference/max": 9.937475204467773,
+      "sampling/sampling_logp_difference/mean": 0.01943521574139595,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 7.023906164249638e-06,
+      "clip_ratio/high_mean": 1.7559765410624095e-06,
+      "clip_ratio/low_mean": 2.526416994896863e-05,
+      "clip_ratio/low_min": 6.7760895490209805e-06,
+      "clip_ratio/region_mean": 2.7020146660561295e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16270.0,
+      "completions/mean_length": 7817.8671875,
+      "completions/mean_terminated_length": 7396.58154296875,
+      "completions/min_length": 1568.0,
+      "completions/min_terminated_length": 1568.0,
+      "entropy": 0.9454319775104523,
+      "epoch": 0.2667893284268629,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022315154783427715,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 251085123.0,
+      "reward": 0.40625,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06,
+      "sampling/sampling_logp_difference/max": 12.760490417480469,
+      "sampling/sampling_logp_difference/mean": 0.021764669567346573,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 1.4797966287005693e-05,
+      "clip_ratio/high_mean": 3.699491571751423e-06,
+      "clip_ratio/low_mean": 4.36271948274225e-05,
+      "clip_ratio/low_min": 3.6957101201551268e-06,
+      "clip_ratio/region_mean": 4.732668639917392e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16352.0,
+      "completions/mean_length": 7168.4921875,
+      "completions/mean_terminated_length": 6635.36328125,
+      "completions/min_length": 817.0,
+      "completions/min_terminated_length": 817.0,
+      "entropy": 0.8433891162276268,
+      "epoch": 0.26770929162833484,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004663965664803982,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 252020906.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999589920043945,
+      "sampling/importance_sampling_ratio/min": 0.0003851866349577904,
+      "sampling/sampling_logp_difference/max": 7.861782550811768,
+      "sampling/sampling_logp_difference/mean": 0.01929781585931778,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 1.996871560550062e-05,
+      "clip_ratio/high_mean": 6.089093403716106e-06,
+      "clip_ratio/low_mean": 4.2792244585143635e-05,
+      "clip_ratio/low_min": 1.0337215371691855e-05,
+      "clip_ratio/region_mean": 4.8881338216233416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7322.5078125,
+      "completions/mean_terminated_length": 6876.8603515625,
+      "completions/min_length": 1196.0,
+      "completions/min_terminated_length": 1196.0,
+      "entropy": 0.9157031401991844,
+      "epoch": 0.2686292548298068,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036942458245903254,
+      "learning_rate": 1e-05,
+      "loss": 0.079,
+      "num_tokens": 252977435.0,
+      "reward": 0.3359375,
+      "reward_std": 0.24275577068328857,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999804496765137,
+      "sampling/importance_sampling_ratio/min": 0.00029605376766994596,
+      "sampling/sampling_logp_difference/max": 8.124969482421875,
+      "sampling/sampling_logp_difference/mean": 0.0205365102738142,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.631919460327481e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.631919460327481e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16078.0,
+      "completions/mean_length": 7025.484375,
+      "completions/mean_terminated_length": 6723.5966796875,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 1.1329731941223145,
+      "epoch": 0.26954921803127874,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034127074759453535,
+      "learning_rate": 1e-05,
+      "loss": 0.0227,
+      "num_tokens": 253896161.0,
+      "reward": 0.25,
+      "reward_std": 0.27722424268722534,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999400973320007,
+      "sampling/importance_sampling_ratio/min": 0.0005197672289796174,
+      "sampling/sampling_logp_difference/max": 7.562129497528076,
+      "sampling/sampling_logp_difference/mean": 0.023741140961647034,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 4.368643658381188e-06,
+      "clip_ratio/high_mean": 1.092160914595297e-06,
+      "clip_ratio/low_mean": 2.4661783299961826e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5753944555617636e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13776.0,
+      "completions/mean_length": 5996.1796875,
+      "completions/mean_terminated_length": 5661.08837890625,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8773328885436058,
+      "epoch": 0.2704691812327507,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003959407564252615,
+      "learning_rate": 1e-05,
+      "loss": 0.0156,
+      "num_tokens": 254690264.0,
+      "reward": 0.53125,
+      "reward_std": 0.26645541191101074,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999563694000244,
+      "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07,
+      "sampling/sampling_logp_difference/max": 15.73043155670166,
+      "sampling/sampling_logp_difference/mean": 0.018407585099339485,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 1.616483677935321e-05,
+      "clip_ratio/high_mean": 4.041209194838302e-06,
+      "clip_ratio/low_mean": 3.736187466074625e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.140308453770558e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16383.0,
+      "completions/mean_length": 7165.328125,
+      "completions/mean_terminated_length": 6867.951171875,
+      "completions/min_length": 1115.0,
+      "completions/min_terminated_length": 1115.0,
+      "entropy": 0.9502597972750664,
+      "epoch": 0.27138914443422263,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0030910037457942963,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 255626394.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000731945037842,
+      "sampling/importance_sampling_ratio/min": 0.00022311302018351853,
+      "sampling/sampling_logp_difference/max": 8.407832145690918,
+      "sampling/sampling_logp_difference/mean": 0.020668907091021538,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 1.1702686606440693e-05,
+      "clip_ratio/high_mean": 2.9256716516101733e-06,
+      "clip_ratio/low_mean": 5.5247357522603124e-05,
+      "clip_ratio/low_min": 3.6811261452385224e-06,
+      "clip_ratio/region_mean": 5.8173028264718596e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15375.0,
+      "completions/mean_length": 8001.9296875,
+      "completions/mean_terminated_length": 7661.34912109375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "entropy": 0.8591345250606537,
+      "epoch": 0.27230910763569455,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037233952898532152,
+      "learning_rate": 1e-05,
+      "loss": 0.0463,
+      "num_tokens": 256673457.0,
+      "reward": 0.421875,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999151229858398,
+      "sampling/importance_sampling_ratio/min": 0.0021876997780054808,
+      "sampling/sampling_logp_difference/max": 6.124904632568359,
+      "sampling/sampling_logp_difference/mean": 0.020540472120046616,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 3.721341136042611e-05,
+      "clip_ratio/high_mean": 1.2759249216287571e-05,
+      "clip_ratio/low_mean": 3.570647322703735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.846572301175911e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 6924.84375,
+      "completions/mean_terminated_length": 6697.82421875,
+      "completions/min_length": 803.0,
+      "completions/min_terminated_length": 803.0,
+      "entropy": 0.7969356626272202,
+      "epoch": 0.2732290708371665,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006054217461496592,
+      "learning_rate": 1e-05,
+      "loss": 0.0669,
+      "num_tokens": 257578501.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2927239239215851,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999213218688965,
+      "sampling/importance_sampling_ratio/min": 0.007889713160693645,
+      "sampling/sampling_logp_difference/max": 4.842195510864258,
+      "sampling/sampling_logp_difference/mean": 0.019306108355522156,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 1.0211543894911301e-05,
+      "clip_ratio/high_mean": 2.5528859737278253e-06,
+      "clip_ratio/low_mean": 5.2388056587915344e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4940942732173426e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14439.0,
+      "completions/mean_length": 6203.03125,
+      "completions/mean_terminated_length": 5958.6884765625,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "entropy": 0.8734413683414459,
+      "epoch": 0.27414903403863844,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004903806839138269,
+      "learning_rate": 1e-05,
+      "loss": 0.0689,
+      "num_tokens": 258392625.0,
+      "reward": 0.4453125,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999826550483704,
+      "sampling/importance_sampling_ratio/min": 0.00020370795391499996,
+      "sampling/sampling_logp_difference/max": 8.498823165893555,
+      "sampling/sampling_logp_difference/mean": 0.01909301057457924,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 1.5135058674786706e-05,
+      "clip_ratio/high_mean": 4.64845766146027e-06,
+      "clip_ratio/low_mean": 4.373456977191381e-05,
+      "clip_ratio/low_min": 3.670856358439778e-06,
+      "clip_ratio/region_mean": 4.8383026296505705e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15554.0,
+      "completions/mean_length": 7982.5390625,
+      "completions/mean_terminated_length": 7641.01611328125,
+      "completions/min_length": 776.0,
+      "completions/min_terminated_length": 776.0,
+      "entropy": 1.0091779381036758,
+      "epoch": 0.2750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033637424930930138,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "num_tokens": 259435270.0,
+      "reward": 0.359375,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999765753746033,
+      "sampling/importance_sampling_ratio/min": 0.0016514655435457826,
+      "sampling/sampling_logp_difference/max": 6.406092166900635,
+      "sampling/sampling_logp_difference/mean": 0.02182736061513424,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 2.3964702677403693e-05,
+      "clip_ratio/high_mean": 5.991175669350923e-06,
+      "clip_ratio/low_mean": 5.2442986770984135e-05,
+      "clip_ratio/low_min": 8.75736759553547e-06,
+      "clip_ratio/region_mean": 5.843416238349164e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16265.0,
+      "completions/mean_length": 6915.3125,
+      "completions/mean_terminated_length": 6688.064453125,
+      "completions/min_length": 778.0,
+      "completions/min_terminated_length": 778.0,
+      "entropy": 0.7964543774724007,
+      "epoch": 0.27598896044158233,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052203768864274025,
+      "learning_rate": 1e-05,
+      "loss": 0.144,
+      "num_tokens": 260337614.0,
+      "reward": 0.46875,
+      "reward_std": 0.37928223609924316,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999016523361206,
+      "sampling/importance_sampling_ratio/min": 7.032832218101248e-05,
+      "sampling/sampling_logp_difference/max": 9.562335968017578,
+      "sampling/sampling_logp_difference/mean": 0.017896221950650215,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 4.458271632756805e-05,
+      "clip_ratio/high_mean": 1.1145679081892013e-05,
+      "clip_ratio/low_mean": 6.243192206056847e-05,
+      "clip_ratio/low_min": 1.2397775662975619e-05,
+      "clip_ratio/region_mean": 7.357759886872373e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16198.0,
+      "completions/mean_length": 7029.4375,
+      "completions/mean_terminated_length": 6880.95263671875,
+      "completions/min_length": 679.0,
+      "completions/min_terminated_length": 679.0,
+      "entropy": 0.8605096861720085,
+      "epoch": 0.2769089236430543,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005570738110691309,
+      "learning_rate": 1e-05,
+      "loss": 0.0984,
+      "num_tokens": 261254070.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3327290117740631,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999494552612305,
+      "sampling/importance_sampling_ratio/min": 0.0009070249507203698,
+      "sampling/sampling_logp_difference/max": 7.005340576171875,
+      "sampling/sampling_logp_difference/mean": 0.01905740052461624,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 3.390461233720998e-05,
+      "clip_ratio/high_mean": 1.1191766247975465e-05,
+      "clip_ratio/low_mean": 7.46641262594494e-05,
+      "clip_ratio/low_min": 5.041745680500753e-06,
+      "clip_ratio/region_mean": 8.585589102949598e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15987.0,
+      "completions/mean_length": 5858.84375,
+      "completions/mean_terminated_length": 5606.240234375,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.8430554121732712,
+      "epoch": 0.2778288868445262,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004496110137552023,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "num_tokens": 262024906.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999294877052307,
+      "sampling/importance_sampling_ratio/min": 0.00040469475788995624,
+      "sampling/sampling_logp_difference/max": 7.812377452850342,
+      "sampling/sampling_logp_difference/mean": 0.019225869327783585,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 3.2563955301156966e-06,
+      "clip_ratio/high_mean": 8.140988825289242e-07,
+      "clip_ratio/low_mean": 3.7080020149460324e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.789411886145899e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15976.0,
+      "completions/mean_length": 8337.328125,
+      "completions/mean_terminated_length": 7728.7568359375,
+      "completions/min_length": 837.0,
+      "completions/min_terminated_length": 837.0,
+      "entropy": 0.901745393872261,
+      "epoch": 0.27874885004599814,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00348713924176991,
+      "learning_rate": 1e-05,
+      "loss": -0.0002,
+      "num_tokens": 263110844.0,
+      "reward": 0.296875,
+      "reward_std": 0.20805485546588898,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998900890350342,
+      "sampling/importance_sampling_ratio/min": 0.0022652465850114822,
+      "sampling/sampling_logp_difference/max": 6.090071678161621,
+      "sampling/sampling_logp_difference/mean": 0.02157524600625038,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 2.3739744847262045e-05,
+      "clip_ratio/high_mean": 5.934936211815511e-06,
+      "clip_ratio/low_mean": 2.823553325015382e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.417046866616147e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16315.0,
+      "completions/mean_length": 7084.7265625,
+      "completions/mean_terminated_length": 6381.42041015625,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8265534415841103,
+      "epoch": 0.2796688132474701,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003980033565312624,
+      "learning_rate": 1e-05,
+      "loss": 0.0551,
+      "num_tokens": 264036169.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27434611320495605,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999673366546631,
+      "sampling/importance_sampling_ratio/min": 0.00012345099821686745,
+      "sampling/sampling_logp_difference/max": 8.999666213989258,
+      "sampling/sampling_logp_difference/mean": 0.018782664090394974,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 1.1745505617000163e-05,
+      "clip_ratio/high_mean": 3.771558226617344e-06,
+      "clip_ratio/low_mean": 6.913120819262986e-05,
+      "clip_ratio/low_min": 2.494283216947224e-05,
+      "clip_ratio/region_mean": 7.290276607818669e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16292.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 6543.796875,
+      "completions/mean_terminated_length": 6543.796875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "entropy": 0.8899869695305824,
+      "epoch": 0.28058877644894203,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.006467343773692846,
+      "learning_rate": 1e-05,
+      "loss": 0.1139,
+      "num_tokens": 264892767.0,
+      "reward": 0.484375,
+      "reward_std": 0.3934885561466217,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000489950180054,
+      "sampling/importance_sampling_ratio/min": 9.891482477542013e-05,
+      "sampling/sampling_logp_difference/max": 9.221251487731934,
+      "sampling/sampling_logp_difference/mean": 0.02032080665230751,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.395576979732141e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.395576979732141e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16307.0,
+      "completions/mean_length": 8483.390625,
+      "completions/mean_terminated_length": 7813.84765625,
+      "completions/min_length": 1342.0,
+      "completions/min_terminated_length": 1342.0,
+      "entropy": 0.9621479511260986,
+      "epoch": 0.281508739650414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003174177836626768,
+      "learning_rate": 1e-05,
+      "loss": 0.0948,
+      "num_tokens": 265995697.0,
+      "reward": 0.3359375,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000269412994385,
+      "sampling/importance_sampling_ratio/min": 0.0005628522485494614,
+      "sampling/sampling_logp_difference/max": 7.4824934005737305,
+      "sampling/sampling_logp_difference/mean": 0.02145479805767536,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 1.2596524811669951e-05,
+      "clip_ratio/high_mean": 3.149131202917488e-06,
+      "clip_ratio/low_mean": 3.7911659774181317e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.106079018129094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14985.0,
+      "completions/mean_length": 7184.578125,
+      "completions/mean_terminated_length": 6963.79248046875,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.9993807673454285,
+      "epoch": 0.2824287028518859,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003356153378263116,
+      "learning_rate": 1e-05,
+      "loss": 0.0887,
+      "num_tokens": 266937707.0,
+      "reward": 0.3828125,
+      "reward_std": 0.25566399097442627,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000238418579102,
+      "sampling/importance_sampling_ratio/min": 0.0017036627978086472,
+      "sampling/sampling_logp_difference/max": 6.374974727630615,
+      "sampling/sampling_logp_difference/mean": 0.02204768732190132,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 1.9245163684900035e-05,
+      "clip_ratio/high_mean": 4.811290921225009e-06,
+      "clip_ratio/low_mean": 4.8845648166206956e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.365693925796222e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16216.0,
+      "completions/mean_length": 7029.2265625,
+      "completions/mean_terminated_length": 6727.45947265625,
+      "completions/min_length": 851.0,
+      "completions/min_terminated_length": 851.0,
+      "entropy": 0.9139953926205635,
+      "epoch": 0.28334866605335784,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006375293247401714,
+      "learning_rate": 1e-05,
+      "loss": 0.0519,
+      "num_tokens": 267853880.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27328038215637207,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000023365020752,
+      "sampling/importance_sampling_ratio/min": 0.010649868287146091,
+      "sampling/sampling_logp_difference/max": 4.542207717895508,
+      "sampling/sampling_logp_difference/mean": 0.020365029573440552,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 4.812504812434781e-06,
+      "clip_ratio/high_mean": 1.2031262031086953e-06,
+      "clip_ratio/low_mean": 2.5999243803198624e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.720237000630732e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16106.0,
+      "completions/mean_length": 6188.0078125,
+      "completions/mean_terminated_length": 5943.30419921875,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.7640773430466652,
+      "epoch": 0.2842686292548298,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003697809297591448,
+      "learning_rate": 1e-05,
+      "loss": 0.0733,
+      "num_tokens": 268665721.0,
+      "reward": 0.5078125,
+      "reward_std": 0.20699402689933777,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372363090515,
+      "sampling/importance_sampling_ratio/min": 0.02927250787615776,
+      "sampling/sampling_logp_difference/max": 3.531106472015381,
+      "sampling/sampling_logp_difference/mean": 0.016581017524003983,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.1358927824621787e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1358927824621787e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16264.0,
+      "completions/mean_length": 8128.21875,
+      "completions/mean_terminated_length": 7861.90283203125,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.8218234181404114,
+      "epoch": 0.28518859245630174,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002286596456542611,
+      "learning_rate": 1e-05,
+      "loss": 0.0763,
+      "num_tokens": 269726181.0,
+      "reward": 0.375,
+      "reward_std": 0.24435341358184814,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999798536300659,
+      "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06,
+      "sampling/sampling_logp_difference/max": 12.90043830871582,
+      "sampling/sampling_logp_difference/mean": 0.019403984770178795,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 1.4808477317274082e-05,
+      "clip_ratio/high_mean": 3.7021193293185206e-06,
+      "clip_ratio/low_mean": 3.0363167581981543e-05,
+      "clip_ratio/low_min": 6.364238288369961e-06,
+      "clip_ratio/region_mean": 3.4065286854456645e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16256.0,
+      "completions/mean_length": 5673.3359375,
+      "completions/mean_terminated_length": 5503.32568359375,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.9275510385632515,
+      "epoch": 0.2861085556577737,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00485506234690547,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 270470616.0,
+      "reward": 0.4921875,
+      "reward_std": 0.25354230403900146,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 0.0009123464697040617,
+      "sampling/sampling_logp_difference/max": 6.999490737915039,
+      "sampling/sampling_logp_difference/mean": 0.01881871558725834,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 1.1274602456978755e-05,
+      "clip_ratio/high_mean": 3.6739949109687586e-06,
+      "clip_ratio/low_mean": 3.968570712231667e-05,
+      "clip_ratio/low_min": 3.4213767321489286e-06,
+      "clip_ratio/region_mean": 4.335970191959859e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16382.0,
+      "completions/mean_length": 6944.8984375,
+      "completions/mean_terminated_length": 6795.07177734375,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9335741624236107,
+      "epoch": 0.28702851885924563,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005874342750757933,
+      "learning_rate": 1e-05,
+      "loss": 0.032,
+      "num_tokens": 271377723.0,
+      "reward": 0.390625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000594854354858,
+      "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05,
+      "sampling/sampling_logp_difference/max": 10.049861907958984,
+      "sampling/sampling_logp_difference/mean": 0.020590776577591896,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 1.264126694877632e-05,
+      "clip_ratio/high_mean": 3.16031673719408e-06,
+      "clip_ratio/low_mean": 3.206376845810155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.522408474054828e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15806.0,
+      "completions/mean_length": 7705.625,
+      "completions/mean_terminated_length": 7278.8193359375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.8491624072194099,
+      "epoch": 0.28794848206071755,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.001684082904830575,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 272384891.0,
+      "reward": 0.390625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 6.605865200981498e-05,
+      "sampling/sampling_logp_difference/max": 9.624967575073242,
+      "sampling/sampling_logp_difference/mean": 0.020136822015047073,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 9.772357770998497e-06,
+      "clip_ratio/high_mean": 2.443089442749624e-06,
+      "clip_ratio/low_mean": 3.8573590472879005e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.101667946088128e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15923.0,
+      "completions/mean_length": 6611.1484375,
+      "completions/mean_terminated_length": 6534.19677734375,
+      "completions/min_length": 1116.0,
+      "completions/min_terminated_length": 1116.0,
+      "entropy": 0.8867302760481834,
+      "epoch": 0.2888684452621895,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003692191792652011,
+      "learning_rate": 1e-05,
+      "loss": 0.1233,
+      "num_tokens": 273251630.0,
+      "reward": 0.3984375,
+      "reward_std": 0.27564430236816406,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999606609344482,
+      "sampling/importance_sampling_ratio/min": 0.0031062732450664043,
+      "sampling/sampling_logp_difference/max": 5.774331569671631,
+      "sampling/sampling_logp_difference/mean": 0.019237037748098373,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 3.0103737344688852e-05,
+      "clip_ratio/high_mean": 9.664363972206047e-06,
+      "clip_ratio/low_mean": 1.7575501146893657e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.723986426644842e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15786.0,
+      "completions/max_terminated_length": 15786.0,
+      "completions/mean_length": 6770.46875,
+      "completions/mean_terminated_length": 6770.46875,
+      "completions/min_length": 957.0,
+      "completions/min_terminated_length": 957.0,
+      "entropy": 0.8252957463264465,
+      "epoch": 0.28978840846366144,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004167635925114155,
+      "learning_rate": 1e-05,
+      "loss": -0.0072,
+      "num_tokens": 274146482.0,
+      "reward": 0.5703125,
+      "reward_std": 0.23486016690731049,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000013828277588,
+      "sampling/importance_sampling_ratio/min": 0.00010247006866848096,
+      "sampling/sampling_logp_difference/max": 9.18593978881836,
+      "sampling/sampling_logp_difference/mean": 0.019684650003910065,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 6.529460733872838e-06,
+      "clip_ratio/high_mean": 1.6323651834682096e-06,
+      "clip_ratio/low_mean": 3.877351048231503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.040587566578324e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15827.0,
+      "completions/mean_length": 8210.859375,
+      "completions/mean_terminated_length": 7365.36181640625,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.8118235394358635,
+      "epoch": 0.2907083716651334,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0030363225378096104,
+      "learning_rate": 1e-05,
+      "loss": 0.0531,
+      "num_tokens": 275214040.0,
+      "reward": 0.3515625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998943209648132,
+      "sampling/importance_sampling_ratio/min": 0.002854935359209776,
+      "sampling/sampling_logp_difference/max": 5.858705997467041,
+      "sampling/sampling_logp_difference/mean": 0.019275270402431488,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 7.0800629146106075e-06,
+      "clip_ratio/high_mean": 1.7700157286526519e-06,
+      "clip_ratio/low_mean": 2.3981688286767167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5751703674359305e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14900.0,
+      "completions/mean_length": 7072.8828125,
+      "completions/mean_terminated_length": 6849.41650390625,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.8018335327506065,
+      "epoch": 0.29162833486660533,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004777858033776283,
+      "learning_rate": 1e-05,
+      "loss": 0.0404,
+      "num_tokens": 276138049.0,
+      "reward": 0.453125,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368190765381,
+      "sampling/importance_sampling_ratio/min": 0.0028502768836915493,
+      "sampling/sampling_logp_difference/max": 5.860339164733887,
+      "sampling/sampling_logp_difference/mean": 0.01849908009171486,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 2.259368602608447e-05,
+      "clip_ratio/high_mean": 5.648421506521117e-06,
+      "clip_ratio/low_mean": 4.28424866640853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.849090737479855e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14447.0,
+      "completions/mean_length": 5889.8359375,
+      "completions/mean_terminated_length": 5723.26220703125,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.7976400703191757,
+      "epoch": 0.29254829806807725,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030593445990234613,
+      "learning_rate": 1e-05,
+      "loss": 0.1331,
+      "num_tokens": 276910124.0,
+      "reward": 0.5859375,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999091029167175,
+      "sampling/importance_sampling_ratio/min": 0.000139843366923742,
+      "sampling/sampling_logp_difference/max": 8.874987602233887,
+      "sampling/sampling_logp_difference/mean": 0.01834402233362198,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 1.4654247024736833e-05,
+      "clip_ratio/high_mean": 3.663561756184208e-06,
+      "clip_ratio/low_mean": 2.377464920755301e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7438210736363544e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16324.0,
+      "completions/mean_length": 7144.265625,
+      "completions/mean_terminated_length": 6689.85205078125,
+      "completions/min_length": 1200.0,
+      "completions/min_terminated_length": 1200.0,
+      "entropy": 0.8309404999017715,
+      "epoch": 0.2934682612695492,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004245694726705551,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 277843542.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24541422724723816,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998534321784973,
+      "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05,
+      "sampling/sampling_logp_difference/max": 11.499897956848145,
+      "sampling/sampling_logp_difference/mean": 0.01875344291329384,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 6.252500952541595e-06,
+      "clip_ratio/high_mean": 2.241558604509919e-06,
+      "clip_ratio/low_mean": 4.735765514851664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9599213525652885e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15722.0,
+      "completions/mean_length": 6779.5234375,
+      "completions/mean_terminated_length": 6703.8974609375,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.9584890529513359,
+      "epoch": 0.29438822447102114,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0035574575886130333,
+      "learning_rate": 1e-05,
+      "loss": 0.0723,
+      "num_tokens": 278730129.0,
+      "reward": 0.3984375,
+      "reward_std": 0.32825323939323425,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.005792221520096064,
+      "sampling/sampling_logp_difference/max": 5.151239395141602,
+      "sampling/sampling_logp_difference/mean": 0.02137477695941925,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 3.2948471016425174e-05,
+      "clip_ratio/high_mean": 9.518853403278627e-06,
+      "clip_ratio/low_mean": 2.195712454522436e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.14759782895635e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15892.0,
+      "completions/max_terminated_length": 15892.0,
+      "completions/mean_length": 5582.9765625,
+      "completions/mean_terminated_length": 5582.9765625,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.8629376217722893,
+      "epoch": 0.2953081876724931,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0037982752546668053,
+      "learning_rate": 1e-05,
+      "loss": 0.0331,
+      "num_tokens": 279462542.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3164186477661133,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999780058860779,
+      "sampling/importance_sampling_ratio/min": 0.0021874974481761456,
+      "sampling/sampling_logp_difference/max": 6.124997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01906203106045723,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 1.1029473625967512e-05,
+      "clip_ratio/high_mean": 2.757368406491878e-06,
+      "clip_ratio/low_mean": 5.367386921761863e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6431237737797346e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 6942.2578125,
+      "completions/mean_terminated_length": 6477.90966796875,
+      "completions/min_length": 1156.0,
+      "completions/min_terminated_length": 1156.0,
+      "entropy": 0.8147861957550049,
+      "epoch": 0.29622815087396503,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0027678858023136854,
+      "learning_rate": 1e-05,
+      "loss": 0.0585,
+      "num_tokens": 280370207.0,
+      "reward": 0.4375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998471736907959,
+      "sampling/importance_sampling_ratio/min": 0.00023058800434228033,
+      "sampling/sampling_logp_difference/max": 8.3748779296875,
+      "sampling/sampling_logp_difference/mean": 0.01940828748047352,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 2.6367894406575942e-05,
+      "clip_ratio/high_mean": 8.765707434577052e-06,
+      "clip_ratio/low_mean": 3.232976985145797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.109547796815605e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15782.0,
+      "completions/mean_length": 6242.53125,
+      "completions/mean_terminated_length": 5915.38671875,
+      "completions/min_length": 1220.0,
+      "completions/min_terminated_length": 1220.0,
+      "entropy": 0.878915011882782,
+      "epoch": 0.297148114075437,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00577945914119482,
+      "learning_rate": 1e-05,
+      "loss": 0.0839,
+      "num_tokens": 281189491.0,
+      "reward": 0.515625,
+      "reward_std": 0.2398776262998581,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 9.611724817659706e-05,
+      "sampling/sampling_logp_difference/max": 9.2499418258667,
+      "sampling/sampling_logp_difference/mean": 0.01948760263621807,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 3.50839609382092e-05,
+      "clip_ratio/high_mean": 1.1664920634757436e-05,
+      "clip_ratio/low_mean": 1.833109013205103e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9996010880495305e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 7004.015625,
+      "completions/mean_terminated_length": 6622.71533203125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "entropy": 0.7964659407734871,
+      "epoch": 0.2980680772769089,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014128695474937558,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 282103997.0,
+      "reward": 0.4140625,
+      "reward_std": 0.21778053045272827,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.0024504722096025944,
+      "sampling/sampling_logp_difference/max": 6.011474609375,
+      "sampling/sampling_logp_difference/mean": 0.019019678235054016,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 1.832260545597819e-05,
+      "clip_ratio/high_mean": 4.580651363994548e-06,
+      "clip_ratio/low_mean": 5.309064226821647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.767129368905444e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7822.6953125,
+      "completions/mean_terminated_length": 7546.52392578125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.8571138679981232,
+      "epoch": 0.29898804047838085,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002476039342582226,
+      "learning_rate": 1e-05,
+      "loss": 0.0515,
+      "num_tokens": 283122382.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999314546585083,
+      "sampling/importance_sampling_ratio/min": 0.0009774373611435294,
+      "sampling/sampling_logp_difference/max": 6.930576324462891,
+      "sampling/sampling_logp_difference/mean": 0.020557202398777008,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 5.738419986300869e-06,
+      "clip_ratio/high_mean": 1.4346049965752172e-06,
+      "clip_ratio/low_mean": 4.19679121819172e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3402517292179255e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7738.8984375,
+      "completions/mean_terminated_length": 6844.57763671875,
+      "completions/min_length": 897.0,
+      "completions/min_terminated_length": 897.0,
+      "entropy": 0.7839021533727646,
+      "epoch": 0.2999080036798528,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005309853237122297,
+      "learning_rate": 1e-05,
+      "loss": 0.043,
+      "num_tokens": 284130081.0,
+      "reward": 0.5234375,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998971223831177,
+      "sampling/importance_sampling_ratio/min": 0.0001319014554610476,
+      "sampling/sampling_logp_difference/max": 8.933455467224121,
+      "sampling/sampling_logp_difference/mean": 0.01873316988348961,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 1.007085802484653e-05,
+      "clip_ratio/high_mean": 2.5177145062116324e-06,
+      "clip_ratio/low_mean": 4.043528815600439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.295300277590286e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15952.0,
+      "completions/mean_length": 7102.2421875,
+      "completions/mean_terminated_length": 6954.9130859375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.8530801385641098,
+      "epoch": 0.30082796688132474,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004228116944432259,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 285058720.0,
+      "reward": 0.5078125,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999712705612183,
+      "sampling/importance_sampling_ratio/min": 0.00012956927821505815,
+      "sampling/sampling_logp_difference/max": 8.951294898986816,
+      "sampling/sampling_logp_difference/mean": 0.019325006753206253,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 4.06874551117653e-06,
+      "clip_ratio/high_mean": 1.0171863777941326e-06,
+      "clip_ratio/low_mean": 3.661125703047219e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.762844340826632e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15594.0,
+      "completions/max_terminated_length": 15594.0,
+      "completions/mean_length": 6583.4765625,
+      "completions/mean_terminated_length": 6583.4765625,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 1.021921381354332,
+      "epoch": 0.3017479300827967,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004967439454048872,
+      "learning_rate": 1e-05,
+      "loss": 0.0374,
+      "num_tokens": 285919765.0,
+      "reward": 0.328125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00004243850708,
+      "sampling/importance_sampling_ratio/min": 0.016675354912877083,
+      "sampling/sampling_logp_difference/max": 4.093823432922363,
+      "sampling/sampling_logp_difference/mean": 0.021393200382590294,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 1.2215251445013564e-05,
+      "clip_ratio/high_mean": 3.053812861253391e-06,
+      "clip_ratio/low_mean": 4.05305947879242e-05,
+      "clip_ratio/low_min": 4.215567059873138e-06,
+      "clip_ratio/region_mean": 4.358440742180392e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16299.0,
+      "completions/mean_length": 7770.5859375,
+      "completions/mean_terminated_length": 7346.97509765625,
+      "completions/min_length": 1040.0,
+      "completions/min_terminated_length": 1040.0,
+      "entropy": 1.0466903448104858,
+      "epoch": 0.30266789328426863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004189736675471067,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 286935512.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2369818240404129,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797344207764,
+      "sampling/importance_sampling_ratio/min": 0.011683559976518154,
+      "sampling/sampling_logp_difference/max": 4.449572563171387,
+      "sampling/sampling_logp_difference/mean": 0.021805983036756516,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 2.0567378214764176e-05,
+      "clip_ratio/high_mean": 5.141844553691044e-06,
+      "clip_ratio/low_mean": 1.8177100628236076e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3318944840866607e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15758.0,
+      "completions/mean_length": 5689.2421875,
+      "completions/mean_terminated_length": 5432.568359375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.7778806164860725,
+      "epoch": 0.30358785648574055,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0032866497058421373,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 287681943.0,
+      "reward": 0.640625,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.640625,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940812587738,
+      "sampling/importance_sampling_ratio/min": 0.00038077132194302976,
+      "sampling/sampling_logp_difference/max": 7.873311519622803,
+      "sampling/sampling_logp_difference/mean": 0.01789461076259613,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 3.109086901531555e-05,
+      "clip_ratio/high_mean": 7.772717253828887e-06,
+      "clip_ratio/low_mean": 3.1423560130861006e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.919627738468989e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13820.0,
+      "completions/mean_length": 6288.1875,
+      "completions/mean_terminated_length": 6127.93701171875,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "entropy": 0.7709921672940254,
+      "epoch": 0.3045078196872125,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023572889622300863,
+      "learning_rate": 1e-05,
+      "loss": 0.0746,
+      "num_tokens": 288506735.0,
+      "reward": 0.484375,
+      "reward_std": 0.3066929280757904,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474287033081,
+      "sampling/importance_sampling_ratio/min": 0.000430915504693985,
+      "sampling/sampling_logp_difference/max": 7.749598503112793,
+      "sampling/sampling_logp_difference/mean": 0.017407266423106194,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 3.4638953366084024e-05,
+      "clip_ratio/high_mean": 9.51674803673086e-06,
+      "clip_ratio/low_mean": 6.26047980176736e-05,
+      "clip_ratio/low_min": 5.51267930859467e-06,
+      "clip_ratio/region_mean": 7.212154741864651e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16318.0,
+      "completions/mean_length": 6775.0234375,
+      "completions/mean_terminated_length": 6465.05615234375,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "entropy": 0.9338318258523941,
+      "epoch": 0.30542778288868444,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0034220058005303144,
+      "learning_rate": 1e-05,
+      "loss": 0.0986,
+      "num_tokens": 289395498.0,
+      "reward": 0.390625,
+      "reward_std": 0.34533774852752686,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999603033065796,
+      "sampling/importance_sampling_ratio/min": 0.0317598432302475,
+      "sampling/sampling_logp_difference/max": 3.449552536010742,
+      "sampling/sampling_logp_difference/mean": 0.019930530339479446,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 7.159989991123439e-05,
+      "clip_ratio/low_min": 1.5592839645250933e-05,
+      "clip_ratio/region_mean": 7.159989991123439e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 7142.9375,
+      "completions/mean_terminated_length": 6844.83837890625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 0.971405878663063,
+      "epoch": 0.3063477460901564,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002513247774913907,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 290329082.0,
+      "reward": 0.328125,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999737739562988,
+      "sampling/importance_sampling_ratio/min": 3.152207455059397e-07,
+      "sampling/sampling_logp_difference/max": 14.969992637634277,
+      "sampling/sampling_logp_difference/mean": 0.022366533055901527,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 1.6507752206962323e-05,
+      "clip_ratio/high_mean": 4.126938051740581e-06,
+      "clip_ratio/low_mean": 1.7493430505055585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.1620368215735652e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15581.0,
+      "completions/mean_length": 6412.2109375,
+      "completions/mean_terminated_length": 6333.69287109375,
+      "completions/min_length": 544.0,
+      "completions/min_terminated_length": 544.0,
+      "entropy": 0.9136044681072235,
+      "epoch": 0.30726770929162833,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0056767817586660385,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 291170133.0,
+      "reward": 0.421875,
+      "reward_std": 0.15650184452533722,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999720454216003,
+      "sampling/importance_sampling_ratio/min": 0.000458698661532253,
+      "sampling/sampling_logp_difference/max": 7.687117099761963,
+      "sampling/sampling_logp_difference/mean": 0.020012658089399338,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 8.26085442895419e-06,
+      "clip_ratio/high_mean": 2.0652136072385474e-06,
+      "clip_ratio/low_mean": 3.6938338666914206e-05,
+      "clip_ratio/low_min": 5.699044777429663e-06,
+      "clip_ratio/region_mean": 3.900355193309224e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16111.0,
+      "completions/mean_length": 8066.1015625,
+      "completions/mean_terminated_length": 7797.7822265625,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "entropy": 1.0789504647254944,
+      "epoch": 0.30818767249310025,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00243841833434999,
+      "learning_rate": 1e-05,
+      "loss": 0.0432,
+      "num_tokens": 292222082.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2688046097755432,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999664425849915,
+      "sampling/importance_sampling_ratio/min": 8.481895929435268e-05,
+      "sampling/sampling_logp_difference/max": 9.374991416931152,
+      "sampling/sampling_logp_difference/mean": 0.023650091141462326,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 5.320054697222076e-06,
+      "clip_ratio/high_mean": 1.330013674305519e-06,
+      "clip_ratio/low_mean": 1.9117383317279746e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0447396991585265e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15176.0,
+      "completions/mean_length": 6836.046875,
+      "completions/mean_terminated_length": 6606.896484375,
+      "completions/min_length": 785.0,
+      "completions/min_terminated_length": 785.0,
+      "entropy": 1.218759760260582,
+      "epoch": 0.3091076356945722,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0020856577903032303,
+      "learning_rate": 1e-05,
+      "loss": 0.0372,
+      "num_tokens": 293115984.0,
+      "reward": 0.21875,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.21875,
+      "rewards/accuracy_reward/std": 0.41502299904823303,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999911785125732,
+      "sampling/importance_sampling_ratio/min": 2.784526441246271e-05,
+      "sampling/sampling_logp_difference/max": 10.488847732543945,
+      "sampling/sampling_logp_difference/mean": 0.022012067958712578,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 2.5695502699818462e-05,
+      "clip_ratio/high_mean": 7.549717793153832e-06,
+      "clip_ratio/low_mean": 4.6741323160404136e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.429104089671455e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15796.0,
+      "completions/mean_length": 7501.9921875,
+      "completions/mean_terminated_length": 7140.9345703125,
+      "completions/min_length": 1237.0,
+      "completions/min_terminated_length": 1237.0,
+      "entropy": 0.8940394818782806,
+      "epoch": 0.31002759889604414,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005163854919373989,
+      "learning_rate": 1e-05,
+      "loss": 0.0354,
+      "num_tokens": 294099503.0,
+      "reward": 0.328125,
+      "reward_std": 0.30904707312583923,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999276399612427,
+      "sampling/importance_sampling_ratio/min": 0.0006545600481331348,
+      "sampling/sampling_logp_difference/max": 7.331547260284424,
+      "sampling/sampling_logp_difference/mean": 0.020813245326280594,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 3.1606674838258186e-05,
+      "clip_ratio/high_mean": 9.45794374729303e-06,
+      "clip_ratio/low_mean": 4.5567895540443715e-05,
+      "clip_ratio/low_min": 4.458871444512624e-06,
+      "clip_ratio/region_mean": 5.502583962879726e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16034.0,
+      "completions/mean_length": 7204.828125,
+      "completions/mean_terminated_length": 6908.7255859375,
+      "completions/min_length": 846.0,
+      "completions/min_terminated_length": 846.0,
+      "entropy": 0.9961872175335884,
+      "epoch": 0.3109475620975161,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029277894645929337,
+      "learning_rate": 1e-05,
+      "loss": 0.0963,
+      "num_tokens": 295042105.0,
+      "reward": 0.390625,
+      "reward_std": 0.28801077604293823,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000677108764648,
+      "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05,
+      "sampling/sampling_logp_difference/max": 10.872637748718262,
+      "sampling/sampling_logp_difference/mean": 0.020187582820653915,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 1.7963964182854397e-05,
+      "clip_ratio/high_mean": 5.194059781388205e-06,
+      "clip_ratio/low_mean": 1.8380221035840805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.357428081722901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15856.0,
+      "completions/mean_length": 6256.859375,
+      "completions/mean_terminated_length": 6013.80810546875,
+      "completions/min_length": 1006.0,
+      "completions/min_terminated_length": 1006.0,
+      "entropy": 0.9293600022792816,
+      "epoch": 0.31186752529898804,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032952844630926847,
+      "learning_rate": 1e-05,
+      "loss": 0.0473,
+      "num_tokens": 295867039.0,
+      "reward": 0.46875,
+      "reward_std": 0.24670752882957458,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999649524688721,
+      "sampling/importance_sampling_ratio/min": 7.995560008566827e-05,
+      "sampling/sampling_logp_difference/max": 9.434039115905762,
+      "sampling/sampling_logp_difference/mean": 0.019491540268063545,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 7.577551059512189e-06,
+      "clip_ratio/high_mean": 1.8943877648780472e-06,
+      "clip_ratio/low_mean": 2.7479814093567256e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9374201631071628e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15412.0,
+      "completions/mean_length": 7397.84375,
+      "completions/mean_terminated_length": 7032.552734375,
+      "completions/min_length": 923.0,
+      "completions/min_terminated_length": 923.0,
+      "entropy": 0.8508890569210052,
+      "epoch": 0.31278748850046,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029417150653898716,
+      "learning_rate": 1e-05,
+      "loss": 0.0621,
+      "num_tokens": 296832843.0,
+      "reward": 0.375,
+      "reward_std": 0.2867125868797302,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000183582305908,
+      "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05,
+      "sampling/sampling_logp_difference/max": 10.93724250793457,
+      "sampling/sampling_logp_difference/mean": 0.01975393109023571,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 3.281225508544594e-05,
+      "clip_ratio/high_mean": 1.3302957199812226e-05,
+      "clip_ratio/low_mean": 5.109179869577929e-05,
+      "clip_ratio/low_min": 6.657612175331451e-06,
+      "clip_ratio/region_mean": 6.439475532715733e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14983.0,
+      "completions/mean_length": 6897.765625,
+      "completions/mean_terminated_length": 6823.07080078125,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.9046694040298462,
+      "epoch": 0.3137074517019319,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0026788609102368355,
+      "learning_rate": 1e-05,
+      "loss": 0.0664,
+      "num_tokens": 297735285.0,
+      "reward": 0.421875,
+      "reward_std": 0.3266732692718506,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999909520149231,
+      "sampling/importance_sampling_ratio/min": 0.001710799871943891,
+      "sampling/sampling_logp_difference/max": 6.370794296264648,
+      "sampling/sampling_logp_difference/mean": 0.020578179508447647,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 1.7319889593636617e-05,
+      "clip_ratio/high_mean": 5.168538336874917e-06,
+      "clip_ratio/low_mean": 7.019768918326008e-05,
+      "clip_ratio/low_min": 2.541147478041239e-05,
+      "clip_ratio/region_mean": 7.53662266106403e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15525.0,
+      "completions/mean_length": 6971.9921875,
+      "completions/mean_terminated_length": 6509.10595703125,
+      "completions/min_length": 6.0,
+      "completions/min_terminated_length": 6.0,
+      "entropy": 0.8658201694488525,
+      "epoch": 0.31462741490340385,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005915141198784113,
+      "learning_rate": 1e-05,
+      "loss": 0.0923,
+      "num_tokens": 298645124.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3742823898792267,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999268651008606,
+      "sampling/importance_sampling_ratio/min": 0.000970841443631798,
+      "sampling/sampling_logp_difference/max": 6.937347412109375,
+      "sampling/sampling_logp_difference/mean": 0.01906151883304119,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 1.8332865238335216e-05,
+      "clip_ratio/high_mean": 4.583216309583804e-06,
+      "clip_ratio/low_mean": 6.167940273371642e-05,
+      "clip_ratio/low_min": 5.969151516183047e-06,
+      "clip_ratio/region_mean": 6.626261847486603e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15054.0,
+      "completions/mean_length": 6545.6953125,
+      "completions/mean_terminated_length": 5889.80859375,
+      "completions/min_length": 800.0,
+      "completions/min_terminated_length": 800.0,
+      "entropy": 0.779609851539135,
+      "epoch": 0.3155473781048758,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0032792428974062204,
+      "learning_rate": 1e-05,
+      "loss": 0.097,
+      "num_tokens": 299503781.0,
+      "reward": 0.609375,
+      "reward_std": 0.38293448090553284,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999361634254456,
+      "sampling/importance_sampling_ratio/min": 0.002187495119869709,
+      "sampling/sampling_logp_difference/max": 6.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.017413027584552765,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.46246323235755e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.46246323235755e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15318.0,
+      "completions/mean_length": 7226.515625,
+      "completions/mean_terminated_length": 7006.736328125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9573849961161613,
+      "epoch": 0.31646734130634774,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005092279519885778,
+      "learning_rate": 1e-05,
+      "loss": 0.1102,
+      "num_tokens": 300447903.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999373555183411,
+      "sampling/importance_sampling_ratio/min": 0.000627054600045085,
+      "sampling/sampling_logp_difference/max": 7.374476909637451,
+      "sampling/sampling_logp_difference/mean": 0.021570835262537003,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 5.487269390869187e-06,
+      "clip_ratio/high_mean": 1.3718173477172968e-06,
+      "clip_ratio/low_mean": 4.7280102080549113e-05,
+      "clip_ratio/low_min": 1.0166083029616857e-05,
+      "clip_ratio/region_mean": 4.865191931457957e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14967.0,
+      "completions/mean_length": 5755.171875,
+      "completions/mean_terminated_length": 5323.10546875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.8482184633612633,
+      "epoch": 0.3173873045078197,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005033228080719709,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "num_tokens": 301206021.0,
+      "reward": 0.390625,
+      "reward_std": 0.3424547016620636,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999947547912598,
+      "sampling/importance_sampling_ratio/min": 0.0014573346124961972,
+      "sampling/sampling_logp_difference/max": 6.531146049499512,
+      "sampling/sampling_logp_difference/mean": 0.018870476633310318,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 5.421346941147931e-06,
+      "clip_ratio/high_mean": 1.3553367352869827e-06,
+      "clip_ratio/low_mean": 1.6510994441887306e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.786633117717429e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15509.0,
+      "completions/mean_length": 7098.7265625,
+      "completions/mean_terminated_length": 6875.88037109375,
+      "completions/min_length": 947.0,
+      "completions/min_terminated_length": 947.0,
+      "entropy": 0.87320177257061,
+      "epoch": 0.31830726770929163,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.007659573573619127,
+      "learning_rate": 1e-05,
+      "loss": 0.0707,
+      "num_tokens": 302133890.0,
+      "reward": 0.421875,
+      "reward_std": 0.23410367965698242,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000004768371582,
+      "sampling/importance_sampling_ratio/min": 0.0012466582702472806,
+      "sampling/sampling_logp_difference/max": 6.687288761138916,
+      "sampling/sampling_logp_difference/mean": 0.019994346424937248,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 1.1556229310372146e-05,
+      "clip_ratio/high_mean": 2.8890573275930365e-06,
+      "clip_ratio/low_mean": 3.8744643916288624e-05,
+      "clip_ratio/low_min": 6.108287834649673e-06,
+      "clip_ratio/region_mean": 4.1633702039689524e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16139.0,
+      "completions/mean_length": 6399.96875,
+      "completions/mean_terminated_length": 6077.90283203125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.9481896534562111,
+      "epoch": 0.31922723091076355,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014135175151750445,
+      "learning_rate": 1e-05,
+      "loss": 0.0487,
+      "num_tokens": 302972566.0,
+      "reward": 0.4140625,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0025698256213217974,
+      "sampling/sampling_logp_difference/max": 5.963917255401611,
+      "sampling/sampling_logp_difference/mean": 0.02073008380830288,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 6.59491388432798e-06,
+      "clip_ratio/high_mean": 2.545892130001448e-06,
+      "clip_ratio/low_mean": 4.620846755187813e-05,
+      "clip_ratio/low_min": 6.243132702365983e-06,
+      "clip_ratio/region_mean": 4.875435956819274e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 7298.078125,
+      "completions/mean_terminated_length": 7226.53564453125,
+      "completions/min_length": 1009.0,
+      "completions/min_terminated_length": 1009.0,
+      "entropy": 0.8719206526875496,
+      "epoch": 0.3201471941122355,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027898226398974657,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 303925976.0,
+      "reward": 0.484375,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.005236432887613773,
+      "sampling/sampling_logp_difference/max": 5.252114772796631,
+      "sampling/sampling_logp_difference/mean": 0.020944103598594666,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 1.052124343914329e-05,
+      "clip_ratio/high_mean": 2.6303108597858227e-06,
+      "clip_ratio/low_mean": 2.010384196182713e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.273415248055244e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14980.0,
+      "completions/mean_length": 5667.0390625,
+      "completions/mean_terminated_length": 5496.9287109375,
+      "completions/min_length": 974.0,
+      "completions/min_terminated_length": 974.0,
+      "entropy": 0.8791451379656792,
+      "epoch": 0.32106715731370744,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0012764945859089494,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 304675157.0,
+      "reward": 0.390625,
+      "reward_std": 0.17965976893901825,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000383853912354,
+      "sampling/importance_sampling_ratio/min": 5.054428584116977e-06,
+      "sampling/sampling_logp_difference/max": 12.195245742797852,
+      "sampling/sampling_logp_difference/mean": 0.018928447738289833,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 9.578045592206763e-06,
+      "clip_ratio/high_mean": 2.3945113980516908e-06,
+      "clip_ratio/low_mean": 3.1114799753595435e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.350931149270764e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15354.0,
+      "completions/max_terminated_length": 15354.0,
+      "completions/mean_length": 5874.4453125,
+      "completions/mean_terminated_length": 5874.4453125,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.9577538818120956,
+      "epoch": 0.3219871205151794,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.00509974779561162,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 305447038.0,
+      "reward": 0.515625,
+      "reward_std": 0.24777325987815857,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999423027038574,
+      "sampling/importance_sampling_ratio/min": 0.004791648127138615,
+      "sampling/sampling_logp_difference/max": 5.340880870819092,
+      "sampling/sampling_logp_difference/mean": 0.02114470861852169,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 1.0903062275247066e-05,
+      "clip_ratio/high_mean": 2.7257655688117666e-06,
+      "clip_ratio/low_mean": 4.784364205079328e-05,
+      "clip_ratio/low_min": 3.861600362142781e-06,
+      "clip_ratio/region_mean": 5.056940744907479e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15670.0,
+      "completions/mean_length": 6197.5703125,
+      "completions/mean_terminated_length": 6035.88134765625,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.8665244281291962,
+      "epoch": 0.32290708371665133,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0030849494505673647,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 306258023.0,
+      "reward": 0.515625,
+      "reward_std": 0.3748064339160919,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998056888580322,
+      "sampling/importance_sampling_ratio/min": 0.000830297009088099,
+      "sampling/sampling_logp_difference/max": 7.093727111816406,
+      "sampling/sampling_logp_difference/mean": 0.021017421036958694,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 1.4299712574938894e-05,
+      "clip_ratio/high_mean": 4.3520980170796975e-06,
+      "clip_ratio/low_mean": 6.213493452378316e-05,
+      "clip_ratio/low_min": 1.0056635801447555e-05,
+      "clip_ratio/region_mean": 6.648703174505499e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16276.0,
+      "completions/mean_length": 7522.578125,
+      "completions/mean_terminated_length": 7381.9208984375,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.8185881152749062,
+      "epoch": 0.32382704691812325,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002946985885500908,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 307240305.0,
+      "reward": 0.3125,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.005127199459820986,
+      "sampling/sampling_logp_difference/max": 5.273195743560791,
+      "sampling/sampling_logp_difference/mean": 0.01965932548046112,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 1.693051035545068e-05,
+      "clip_ratio/high_mean": 5.08456730585749e-06,
+      "clip_ratio/low_mean": 4.2052345861520735e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.713691282631771e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14090.0,
+      "completions/mean_length": 6403.2265625,
+      "completions/mean_terminated_length": 6163.6884765625,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "entropy": 0.8359840363264084,
+      "epoch": 0.3247470101195952,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031181599479168653,
+      "learning_rate": 1e-05,
+      "loss": 0.072,
+      "num_tokens": 308079318.0,
+      "reward": 0.5,
+      "reward_std": 0.27145031094551086,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999215602874756,
+      "sampling/importance_sampling_ratio/min": 6.73715621815063e-05,
+      "sampling/sampling_logp_difference/max": 9.605287551879883,
+      "sampling/sampling_logp_difference/mean": 0.01963040418922901,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 1.3988919135954347e-05,
+      "clip_ratio/high_mean": 3.497229783988587e-06,
+      "clip_ratio/low_mean": 6.722658486069122e-05,
+      "clip_ratio/low_min": 1.858519090092159e-05,
+      "clip_ratio/region_mean": 7.072381458783639e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16148.0,
+      "completions/mean_length": 7954.03125,
+      "completions/mean_terminated_length": 7751.71240234375,
+      "completions/min_length": 632.0,
+      "completions/min_terminated_length": 632.0,
+      "entropy": 0.905990719795227,
+      "epoch": 0.32566697332106714,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002656223252415657,
+      "learning_rate": 1e-05,
+      "loss": 0.1022,
+      "num_tokens": 309117770.0,
+      "reward": 0.3828125,
+      "reward_std": 0.321655809879303,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999536275863647,
+      "sampling/importance_sampling_ratio/min": 0.0003354826185386628,
+      "sampling/sampling_logp_difference/max": 7.999940395355225,
+      "sampling/sampling_logp_difference/mean": 0.020741507411003113,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 1.7610595023143105e-05,
+      "clip_ratio/high_mean": 4.402648755785776e-06,
+      "clip_ratio/low_mean": 4.337988764291367e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.778253651238629e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16272.0,
+      "completions/mean_length": 6630.09375,
+      "completions/mean_terminated_length": 6315.45166015625,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.870736837387085,
+      "epoch": 0.3265869365225391,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0060529084876179695,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 309988894.0,
+      "reward": 0.515625,
+      "reward_std": 0.2790592312812805,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998822212219238,
+      "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05,
+      "sampling/sampling_logp_difference/max": 10.716434478759766,
+      "sampling/sampling_logp_difference/mean": 0.02060208097100258,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 1.0448093235027045e-05,
+      "clip_ratio/high_mean": 2.6120233087567613e-06,
+      "clip_ratio/low_mean": 3.1030769946482906e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.364279325523967e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15920.0,
+      "completions/max_terminated_length": 15920.0,
+      "completions/mean_length": 6679.6171875,
+      "completions/mean_terminated_length": 6679.6171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9812518879771233,
+      "epoch": 0.32750689972401104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00400698184967041,
+      "learning_rate": 1e-05,
+      "loss": 0.0605,
+      "num_tokens": 310864013.0,
+      "reward": 0.421875,
+      "reward_std": 0.3295465111732483,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999049305915833,
+      "sampling/importance_sampling_ratio/min": 0.0020593837834894657,
+      "sampling/sampling_logp_difference/max": 6.1853485107421875,
+      "sampling/sampling_logp_difference/mean": 0.02098071575164795,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 2.124982574969181e-05,
+      "clip_ratio/high_mean": 7.736592579021817e-06,
+      "clip_ratio/low_mean": 2.900951585615985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.674610888992902e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14541.0,
+      "completions/mean_length": 5523.796875,
+      "completions/mean_terminated_length": 5173.4677734375,
+      "completions/min_length": 633.0,
+      "completions/min_terminated_length": 633.0,
+      "entropy": 0.9120645374059677,
+      "epoch": 0.32842686292548295,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005929585546255112,
+      "learning_rate": 1e-05,
+      "loss": 0.0362,
+      "num_tokens": 311589987.0,
+      "reward": 0.4765625,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998446702957153,
+      "sampling/importance_sampling_ratio/min": 0.0010661041596904397,
+      "sampling/sampling_logp_difference/max": 6.843744277954102,
+      "sampling/sampling_logp_difference/mean": 0.019948206841945648,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 2.4486997745043482e-05,
+      "clip_ratio/high_mean": 8.219769085826556e-06,
+      "clip_ratio/low_mean": 5.346400575945154e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.168377467474784e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15401.0,
+      "completions/mean_length": 6361.3671875,
+      "completions/mean_terminated_length": 6282.44873046875,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.8044678047299385,
+      "epoch": 0.32934682612695493,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.006622390355914831,
+      "learning_rate": 1e-05,
+      "loss": 0.1023,
+      "num_tokens": 312424034.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3724474310874939,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000219345092773,
+      "sampling/importance_sampling_ratio/min": 0.0003157092141918838,
+      "sampling/sampling_logp_difference/max": 8.060688972473145,
+      "sampling/sampling_logp_difference/mean": 0.018907658755779266,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 1.0407376748844399e-05,
+      "clip_ratio/high_mean": 2.6018441872110998e-06,
+      "clip_ratio/low_mean": 5.925514369664597e-05,
+      "clip_ratio/low_min": 1.3324347946763737e-05,
+      "clip_ratio/region_mean": 6.185698703120579e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15883.0,
+      "completions/mean_length": 7109.0,
+      "completions/mean_terminated_length": 7035.96826171875,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.9167275875806808,
+      "epoch": 0.33026678932842685,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004639944992959499,
+      "learning_rate": 1e-05,
+      "loss": 0.0861,
+      "num_tokens": 313353346.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3826971650123596,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999389052391052,
+      "sampling/importance_sampling_ratio/min": 0.0019070414127781987,
+      "sampling/sampling_logp_difference/max": 6.262202262878418,
+      "sampling/sampling_logp_difference/mean": 0.02155841514468193,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 3.959046694035351e-05,
+      "clip_ratio/high_mean": 1.0912523691786191e-05,
+      "clip_ratio/low_mean": 3.3944450819944905e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.485697365907981e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15795.0,
+      "completions/mean_length": 6314.2734375,
+      "completions/mean_terminated_length": 6072.60009765625,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.8780038207769394,
+      "epoch": 0.3311867525298988,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.007643720600754023,
+      "learning_rate": 1e-05,
+      "loss": 0.0873,
+      "num_tokens": 314180717.0,
+      "reward": 0.4609375,
+      "reward_std": 0.28117600083351135,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999802112579346,
+      "sampling/importance_sampling_ratio/min": 0.021285315975546837,
+      "sampling/sampling_logp_difference/max": 3.8497378826141357,
+      "sampling/sampling_logp_difference/mean": 0.01964358240365982,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 3.065382111344661e-05,
+      "clip_ratio/high_mean": 9.187473835936544e-06,
+      "clip_ratio/low_mean": 4.137891801292426e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.056639065514901e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6718.2265625,
+      "completions/mean_terminated_length": 6486.24853515625,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.8326799497008324,
+      "epoch": 0.33210671573137074,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0050973957404494286,
+      "learning_rate": 1e-05,
+      "loss": 0.0109,
+      "num_tokens": 315060842.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3521803915500641,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000014066696167,
+      "sampling/importance_sampling_ratio/min": 0.0009130688849836588,
+      "sampling/sampling_logp_difference/max": 6.998699188232422,
+      "sampling/sampling_logp_difference/mean": 0.019501537084579468,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 8.624853762739804e-06,
+      "clip_ratio/high_mean": 2.156213440684951e-06,
+      "clip_ratio/low_mean": 1.8797969062234188e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0954182048171788e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16128.0,
+      "completions/mean_length": 8666.8359375,
+      "completions/mean_terminated_length": 7941.291015625,
+      "completions/min_length": 565.0,
+      "completions/min_terminated_length": 565.0,
+      "entropy": 0.9526705741882324,
+      "epoch": 0.3330266789328427,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019092690199613571,
+      "learning_rate": 1e-05,
+      "loss": 0.036,
+      "num_tokens": 316190325.0,
+      "reward": 0.234375,
+      "reward_std": 0.2022808939218521,
+      "rewards/accuracy_reward/mean": 0.234375,
+      "rewards/accuracy_reward/std": 0.42527204751968384,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999814629554749,
+      "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05,
+      "sampling/sampling_logp_difference/max": 10.249995231628418,
+      "sampling/sampling_logp_difference/mean": 0.02051631174981594,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 2.147400391550036e-05,
+      "clip_ratio/high_mean": 6.434908300434472e-06,
+      "clip_ratio/low_mean": 3.521234066283796e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.164724816746457e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15164.0,
+      "completions/mean_length": 7661.8203125,
+      "completions/mean_terminated_length": 7002.16015625,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 0.8322782590985298,
+      "epoch": 0.33394664213431463,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0019530428107827902,
+      "learning_rate": 1e-05,
+      "loss": 0.0729,
+      "num_tokens": 317191878.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21382391452789307,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 8.546619210392237e-05,
+      "sampling/sampling_logp_difference/max": 9.367389678955078,
+      "sampling/sampling_logp_difference/mean": 0.019894573837518692,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 1.9436202364886412e-05,
+      "clip_ratio/high_mean": 6.089704697842535e-06,
+      "clip_ratio/low_mean": 4.2698405422925134e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.878810955233348e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15934.0,
+      "completions/mean_length": 7024.859375,
+      "completions/mean_terminated_length": 6800.240234375,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.794853538274765,
+      "epoch": 0.33486660533578655,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0031784537713974714,
+      "learning_rate": 1e-05,
+      "loss": 0.0391,
+      "num_tokens": 318109004.0,
+      "reward": 0.4921875,
+      "reward_std": 0.31800347566604614,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999352693557739,
+      "sampling/importance_sampling_ratio/min": 0.0002962362195830792,
+      "sampling/sampling_logp_difference/max": 8.124353408813477,
+      "sampling/sampling_logp_difference/mean": 0.018519200384616852,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 4.127455667912727e-06,
+      "clip_ratio/high_mean": 1.0318639169781818e-06,
+      "clip_ratio/low_mean": 4.342453667049995e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.445640047379129e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15624.0,
+      "completions/mean_length": 7282.1796875,
+      "completions/mean_terminated_length": 6912.1865234375,
+      "completions/min_length": 870.0,
+      "completions/min_terminated_length": 870.0,
+      "entropy": 0.904067650437355,
+      "epoch": 0.3357865685372585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005080109462141991,
+      "learning_rate": 1e-05,
+      "loss": 0.041,
+      "num_tokens": 319059075.0,
+      "reward": 0.4140625,
+      "reward_std": 0.26539456844329834,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000062108039856,
+      "sampling/importance_sampling_ratio/min": 0.1194523349404335,
+      "sampling/sampling_logp_difference/max": 6.136754989624023,
+      "sampling/sampling_logp_difference/mean": 0.019978653639554977,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.608940076243016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.608940076243016e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15625.0,
+      "completions/mean_length": 7131.5234375,
+      "completions/mean_terminated_length": 6596.255859375,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.8849587142467499,
+      "epoch": 0.33670653173873044,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0022667953744530678,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "num_tokens": 319990046.0,
+      "reward": 0.46875,
+      "reward_std": 0.30221715569496155,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0370909757912159,
+      "sampling/sampling_logp_difference/max": 3.294381618499756,
+      "sampling/sampling_logp_difference/mean": 0.02037571743130684,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 1.5356635913121863e-05,
+      "clip_ratio/high_mean": 3.839158978280466e-06,
+      "clip_ratio/low_mean": 3.4950805911648786e-05,
+      "clip_ratio/low_min": 4.876336333836662e-06,
+      "clip_ratio/region_mean": 3.8789965287833184e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16205.0,
+      "completions/mean_length": 6655.4453125,
+      "completions/mean_terminated_length": 6578.84228515625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.7417122721672058,
+      "epoch": 0.3376264949402024,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00216497085057199,
+      "learning_rate": 1e-05,
+      "loss": 0.0681,
+      "num_tokens": 320860135.0,
+      "reward": 0.5625,
+      "reward_std": 0.3369230031967163,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 0.0005190494703128934,
+      "sampling/sampling_logp_difference/max": 7.563511371612549,
+      "sampling/sampling_logp_difference/mean": 0.01771342009305954,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 1.7605634639039636e-05,
+      "clip_ratio/high_mean": 5.297029474604642e-06,
+      "clip_ratio/low_mean": 5.688933060810086e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.218636053745286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15849.0,
+      "completions/mean_length": 7077.1640625,
+      "completions/mean_terminated_length": 6619.45068359375,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "entropy": 0.8749325424432755,
+      "epoch": 0.33854645814167433,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0028338562697172165,
+      "learning_rate": 1e-05,
+      "loss": 0.0643,
+      "num_tokens": 321783852.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2120065838098526,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998220205307007,
+      "sampling/importance_sampling_ratio/min": 7.83290306571871e-06,
+      "sampling/sampling_logp_difference/max": 11.757177352905273,
+      "sampling/sampling_logp_difference/mean": 0.020299233496189117,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 7.301828190975357e-06,
+      "clip_ratio/high_mean": 1.8254570477438392e-06,
+      "clip_ratio/low_mean": 5.158197632226802e-05,
+      "clip_ratio/low_min": 3.735804057214409e-06,
+      "clip_ratio/region_mean": 5.340743223314348e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15329.0,
+      "completions/mean_length": 6034.296875,
+      "completions/mean_terminated_length": 5525.294921875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.80014718323946,
+      "epoch": 0.33946642134314625,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022897711023688316,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 322572882.0,
+      "reward": 0.40625,
+      "reward_std": 0.2756394147872925,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999347925186157,
+      "sampling/importance_sampling_ratio/min": 0.0004105660773348063,
+      "sampling/sampling_logp_difference/max": 7.7979736328125,
+      "sampling/sampling_logp_difference/mean": 0.01858348958194256,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 9.364057859784225e-06,
+      "clip_ratio/high_mean": 3.351393047523743e-06,
+      "clip_ratio/low_mean": 4.186752630630508e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5218919240141986e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15948.0,
+      "completions/mean_length": 8172.109375,
+      "completions/mean_terminated_length": 7838.29248046875,
+      "completions/min_length": 733.0,
+      "completions/min_terminated_length": 733.0,
+      "entropy": 0.8732693120837212,
+      "epoch": 0.3403863845446182,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003263789461925626,
+      "learning_rate": 1e-05,
+      "loss": 0.0356,
+      "num_tokens": 323640904.0,
+      "reward": 0.2890625,
+      "reward_std": 0.3237774670124054,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999354481697083,
+      "sampling/importance_sampling_ratio/min": 9.27252222027164e-06,
+      "sampling/sampling_logp_difference/max": 11.588455200195312,
+      "sampling/sampling_logp_difference/mean": 0.0208889190107584,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 2.0998899799451465e-05,
+      "clip_ratio/high_mean": 6.692962131182867e-06,
+      "clip_ratio/low_mean": 4.261424010110204e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.930720297124935e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16228.0,
+      "completions/mean_length": 7699.203125,
+      "completions/mean_terminated_length": 7419.04833984375,
+      "completions/min_length": 1225.0,
+      "completions/min_terminated_length": 1225.0,
+      "entropy": 0.8296505436301231,
+      "epoch": 0.34130634774609014,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0042716520838439465,
+      "learning_rate": 1e-05,
+      "loss": 0.0937,
+      "num_tokens": 324643858.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3090519607067108,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999874234199524,
+      "sampling/importance_sampling_ratio/min": 0.00022192654432728887,
+      "sampling/sampling_logp_difference/max": 8.413164138793945,
+      "sampling/sampling_logp_difference/mean": 0.018926654011011124,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 7.061349151626928e-06,
+      "clip_ratio/high_mean": 1.765337287906732e-06,
+      "clip_ratio/low_mean": 4.5005243464402156e-05,
+      "clip_ratio/low_min": 3.861838649754645e-06,
+      "clip_ratio/region_mean": 4.6770580411248375e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16364.0,
+      "completions/max_terminated_length": 16364.0,
+      "completions/mean_length": 7450.1640625,
+      "completions/mean_terminated_length": 7450.1640625,
+      "completions/min_length": 910.0,
+      "completions/min_terminated_length": 910.0,
+      "entropy": 1.0400195196270943,
+      "epoch": 0.3422263109475621,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0033558050636202097,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 325617687.0,
+      "reward": 0.2578125,
+      "reward_std": 0.27222445607185364,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999459385871887,
+      "sampling/importance_sampling_ratio/min": 0.039920732378959656,
+      "sampling/sampling_logp_difference/max": 3.2208595275878906,
+      "sampling/sampling_logp_difference/mean": 0.02249298244714737,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 1.3147802746971138e-05,
+      "clip_ratio/high_mean": 3.2869506867427845e-06,
+      "clip_ratio/low_mean": 2.4451034505545977e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7737984851228248e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15342.0,
+      "completions/mean_length": 6799.0703125,
+      "completions/mean_terminated_length": 6723.5986328125,
+      "completions/min_length": 1708.0,
+      "completions/min_terminated_length": 1708.0,
+      "entropy": 0.9737623482942581,
+      "epoch": 0.34314627414903404,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005797459278255701,
+      "learning_rate": 1e-05,
+      "loss": 0.0476,
+      "num_tokens": 326508384.0,
+      "reward": 0.3125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999321699142456,
+      "sampling/importance_sampling_ratio/min": 7.535634836131067e-07,
+      "sampling/sampling_logp_difference/max": 14.0984525680542,
+      "sampling/sampling_logp_difference/mean": 0.021543748676776886,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 3.3594023989280686e-06,
+      "clip_ratio/high_mean": 8.398505997320171e-07,
+      "clip_ratio/low_mean": 2.3457610382138228e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4297460981870245e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16102.0,
+      "completions/mean_length": 7034.3671875,
+      "completions/mean_terminated_length": 6654.30078125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8749603256583214,
+      "epoch": 0.34406623735050595,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002258980879560113,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 327426407.0,
+      "reward": 0.4609375,
+      "reward_std": 0.19674429297447205,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999661445617676,
+      "sampling/importance_sampling_ratio/min": 0.008719252422451973,
+      "sampling/sampling_logp_difference/max": 4.742221832275391,
+      "sampling/sampling_logp_difference/mean": 0.01997346058487892,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 2.823375348270929e-05,
+      "clip_ratio/high_mean": 7.058438370677322e-06,
+      "clip_ratio/low_mean": 4.9395109726901865e-05,
+      "clip_ratio/low_min": 1.636556044104509e-05,
+      "clip_ratio/region_mean": 5.6453548268109444e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15240.0,
+      "completions/mean_length": 6623.078125,
+      "completions/mean_terminated_length": 6388.81640625,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.858784057199955,
+      "epoch": 0.34498620055197793,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002420129720121622,
+      "learning_rate": 1e-05,
+      "loss": 0.076,
+      "num_tokens": 328292985.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3077537417411804,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998596906661987,
+      "sampling/importance_sampling_ratio/min": 0.00014900295354891568,
+      "sampling/sampling_logp_difference/max": 8.811544418334961,
+      "sampling/sampling_logp_difference/mean": 0.019645996391773224,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 1.8078507309837732e-05,
+      "clip_ratio/high_mean": 6.468551191574079e-06,
+      "clip_ratio/low_mean": 4.051302585139638e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.698157727034413e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15229.0,
+      "completions/mean_length": 5902.4765625,
+      "completions/mean_terminated_length": 5564.36279296875,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.904740035533905,
+      "epoch": 0.34590616375344985,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004107976797968149,
+      "learning_rate": 1e-05,
+      "loss": 0.0824,
+      "num_tokens": 329067006.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3945493996143341,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999526143074036,
+      "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05,
+      "sampling/sampling_logp_difference/max": 11.37439250946045,
+      "sampling/sampling_logp_difference/mean": 0.019582755863666534,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 2.553658168835682e-05,
+      "clip_ratio/high_mean": 7.276365181496658e-06,
+      "clip_ratio/low_mean": 1.7552573126522475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.482893796695862e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14514.0,
+      "completions/mean_length": 6425.6015625,
+      "completions/mean_terminated_length": 6267.5322265625,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.964553713798523,
+      "epoch": 0.3468261269549218,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003208522219210863,
+      "learning_rate": 1e-05,
+      "loss": 0.0164,
+      "num_tokens": 329910691.0,
+      "reward": 0.359375,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999419450759888,
+      "sampling/importance_sampling_ratio/min": 0.00137569778598845,
+      "sampling/sampling_logp_difference/max": 6.588794231414795,
+      "sampling/sampling_logp_difference/mean": 0.021154657006263733,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 6.8712420215888415e-06,
+      "clip_ratio/high_mean": 1.7178105053972104e-06,
+      "clip_ratio/low_mean": 4.0991827404468495e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2709637853022286e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15797.0,
+      "completions/mean_length": 8006.4453125,
+      "completions/mean_terminated_length": 7594.43408203125,
+      "completions/min_length": 1235.0,
+      "completions/min_terminated_length": 1235.0,
+      "entropy": 0.8980336412787437,
+      "epoch": 0.34774609015639374,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002898421371355653,
+      "learning_rate": 1e-05,
+      "loss": 0.0815,
+      "num_tokens": 330956332.0,
+      "reward": 0.4296875,
+      "reward_std": 0.20175684988498688,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998818635940552,
+      "sampling/importance_sampling_ratio/min": 9.378339746035635e-05,
+      "sampling/sampling_logp_difference/max": 9.27452278137207,
+      "sampling/sampling_logp_difference/mean": 0.021021340042352676,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.2689344689297286e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2689344689297286e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15484.0,
+      "completions/max_terminated_length": 15484.0,
+      "completions/mean_length": 7068.828125,
+      "completions/mean_terminated_length": 7068.828125,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.9865007549524307,
+      "epoch": 0.3486660533578657,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0037063576746731997,
+      "learning_rate": 1e-05,
+      "loss": 0.0313,
+      "num_tokens": 331880918.0,
+      "reward": 0.3203125,
+      "reward_std": 0.17859892547130585,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 0.0001819290773710236,
+      "sampling/sampling_logp_difference/max": 8.611893653869629,
+      "sampling/sampling_logp_difference/mean": 0.02072504535317421,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 5.845633268108941e-06,
+      "clip_ratio/high_mean": 1.4614083170272352e-06,
+      "clip_ratio/low_mean": 3.207486906831036e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.353627721480734e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16281.0,
+      "completions/mean_length": 7379.390625,
+      "completions/mean_terminated_length": 7236.4609375,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.8977236375212669,
+      "epoch": 0.34958601655933763,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001972826896235347,
+      "learning_rate": 1e-05,
+      "loss": 0.0228,
+      "num_tokens": 332849112.0,
+      "reward": 0.4140625,
+      "reward_std": 0.28247418999671936,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999925971031189,
+      "sampling/importance_sampling_ratio/min": 2.820451663865242e-05,
+      "sampling/sampling_logp_difference/max": 10.476028442382812,
+      "sampling/sampling_logp_difference/mean": 0.019411223009228706,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 4.875385002378607e-06,
+      "clip_ratio/high_mean": 1.2188462505946518e-06,
+      "clip_ratio/low_mean": 2.3530714997832547e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.47495612484272e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15517.0,
+      "completions/mean_length": 6867.9609375,
+      "completions/mean_terminated_length": 6793.03125,
+      "completions/min_length": 760.0,
+      "completions/min_terminated_length": 760.0,
+      "entropy": 0.9244343340396881,
+      "epoch": 0.35050597976080955,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.006926023401319981,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 333746179.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1433562934398651,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999299645423889,
+      "sampling/importance_sampling_ratio/min": 0.0003875594411510974,
+      "sampling/sampling_logp_difference/max": 7.8556413650512695,
+      "sampling/sampling_logp_difference/mean": 0.020311862230300903,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 1.5651628245905158e-05,
+      "clip_ratio/high_mean": 4.836261211949022e-06,
+      "clip_ratio/low_mean": 5.268017821435933e-05,
+      "clip_ratio/low_min": 3.950945028918795e-06,
+      "clip_ratio/region_mean": 5.751643902840442e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 7525.375,
+      "completions/mean_terminated_length": 6855.3955078125,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.9207312315702438,
+      "epoch": 0.3514259429622815,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0047226278111338615,
+      "learning_rate": 1e-05,
+      "loss": 0.0808,
+      "num_tokens": 334731027.0,
+      "reward": 0.3359375,
+      "reward_std": 0.3353874683380127,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999615550041199,
+      "sampling/importance_sampling_ratio/min": 0.00029753465787507594,
+      "sampling/sampling_logp_difference/max": 8.119979858398438,
+      "sampling/sampling_logp_difference/mean": 0.021496692672371864,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 3.815379886873416e-05,
+      "clip_ratio/high_mean": 9.53844971718354e-06,
+      "clip_ratio/low_mean": 4.519663821156428e-05,
+      "clip_ratio/low_min": 2.775434040813707e-06,
+      "clip_ratio/region_mean": 5.473508826980833e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16251.0,
+      "completions/mean_length": 6841.0625,
+      "completions/mean_terminated_length": 6453.13818359375,
+      "completions/min_length": 689.0,
+      "completions/min_terminated_length": 689.0,
+      "entropy": 0.8979457840323448,
+      "epoch": 0.35234590616375344,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004971448332071304,
+      "learning_rate": 1e-05,
+      "loss": 0.0126,
+      "num_tokens": 335631243.0,
+      "reward": 0.390625,
+      "reward_std": 0.2596156895160675,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999934196472168,
+      "sampling/importance_sampling_ratio/min": 9.655764188210014e-06,
+      "sampling/sampling_logp_difference/max": 11.547955513000488,
+      "sampling/sampling_logp_difference/mean": 0.020256079733371735,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 4.162365712545579e-06,
+      "clip_ratio/high_mean": 1.0405914281363948e-06,
+      "clip_ratio/low_mean": 3.1563491688757495e-05,
+      "clip_ratio/low_min": 3.1228139505401487e-06,
+      "clip_ratio/region_mean": 3.260408311689389e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15060.0,
+      "completions/mean_length": 6919.8046875,
+      "completions/mean_terminated_length": 6454.35205078125,
+      "completions/min_length": 896.0,
+      "completions/min_terminated_length": 896.0,
+      "entropy": 0.9241961911320686,
+      "epoch": 0.3532658693652254,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0038604787550866604,
+      "learning_rate": 1e-05,
+      "loss": 0.0262,
+      "num_tokens": 336537162.0,
+      "reward": 0.375,
+      "reward_std": 0.2777610421180725,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998080730438232,
+      "sampling/importance_sampling_ratio/min": 0.0009118975722230971,
+      "sampling/sampling_logp_difference/max": 6.999982833862305,
+      "sampling/sampling_logp_difference/mean": 0.02030865103006363,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 6.5182248363271356e-06,
+      "clip_ratio/high_mean": 1.6295562090817839e-06,
+      "clip_ratio/low_mean": 4.3847362121596234e-05,
+      "clip_ratio/low_min": 6.294533704931382e-06,
+      "clip_ratio/region_mean": 4.547691833067802e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15692.0,
+      "completions/mean_length": 7679.390625,
+      "completions/mean_terminated_length": 7099.08349609375,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "entropy": 1.0165777206420898,
+      "epoch": 0.35418583256669733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004624314606189728,
+      "learning_rate": 1e-05,
+      "loss": 0.0849,
+      "num_tokens": 337542492.0,
+      "reward": 0.3046875,
+      "reward_std": 0.2517249882221222,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999251961708069,
+      "sampling/importance_sampling_ratio/min": 5.83546279813163e-05,
+      "sampling/sampling_logp_difference/max": 9.748971939086914,
+      "sampling/sampling_logp_difference/mean": 0.02206476218998432,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 6.00499606662197e-06,
+      "clip_ratio/high_mean": 1.5012490166554926e-06,
+      "clip_ratio/low_mean": 3.392923713363416e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.543048615028965e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15819.0,
+      "completions/mean_length": 5957.5859375,
+      "completions/mean_terminated_length": 5792.08740234375,
+      "completions/min_length": 1705.0,
+      "completions/min_terminated_length": 1705.0,
+      "entropy": 0.7705951780080795,
+      "epoch": 0.35510579576816925,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021966886706650257,
+      "learning_rate": 1e-05,
+      "loss": 0.0789,
+      "num_tokens": 338324279.0,
+      "reward": 0.53125,
+      "reward_std": 0.22962790727615356,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999998927116394,
+      "sampling/importance_sampling_ratio/min": 0.0008041196851991117,
+      "sampling/sampling_logp_difference/max": 7.125762462615967,
+      "sampling/sampling_logp_difference/mean": 0.01804077997803688,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 1.5711350215497077e-05,
+      "clip_ratio/high_mean": 3.927837553874269e-06,
+      "clip_ratio/low_mean": 5.276240381135722e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.669024130838807e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7269.8046875,
+      "completions/mean_terminated_length": 7198.03955078125,
+      "completions/min_length": 892.0,
+      "completions/min_terminated_length": 892.0,
+      "entropy": 1.0025205165147781,
+      "epoch": 0.3560257589696412,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.001694107661023736,
+      "learning_rate": 1e-05,
+      "loss": 0.134,
+      "num_tokens": 339274662.0,
+      "reward": 0.3359375,
+      "reward_std": 0.30487072467803955,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999039769172668,
+      "sampling/importance_sampling_ratio/min": 0.0015677008777856827,
+      "sampling/sampling_logp_difference/max": 6.4581451416015625,
+      "sampling/sampling_logp_difference/mean": 0.021742526441812515,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 7.005848829066963e-06,
+      "clip_ratio/high_mean": 1.7514622072667407e-06,
+      "clip_ratio/low_mean": 5.100632029098051e-05,
+      "clip_ratio/low_min": 8.934973720897688e-06,
+      "clip_ratio/region_mean": 5.275778244140383e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15755.0,
+      "completions/mean_length": 7643.8359375,
+      "completions/mean_terminated_length": 7288.54443359375,
+      "completions/min_length": 1061.0,
+      "completions/min_terminated_length": 1061.0,
+      "entropy": 0.7936615869402885,
+      "epoch": 0.35694572217111314,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004587972536683083,
+      "learning_rate": 1e-05,
+      "loss": 0.0691,
+      "num_tokens": 340272689.0,
+      "reward": 0.5078125,
+      "reward_std": 0.35324612259864807,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999613761901855,
+      "sampling/importance_sampling_ratio/min": 0.0007390327518805861,
+      "sampling/sampling_logp_difference/max": 7.210168361663818,
+      "sampling/sampling_logp_difference/mean": 0.01862112432718277,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 1.0522736374696251e-05,
+      "clip_ratio/high_mean": 2.6306840936740628e-06,
+      "clip_ratio/low_mean": 2.139122614153166e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4021910121518886e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14401.0,
+      "completions/mean_length": 7068.734375,
+      "completions/mean_terminated_length": 6610.60595703125,
+      "completions/min_length": 775.0,
+      "completions/min_terminated_length": 775.0,
+      "entropy": 0.8858344480395317,
+      "epoch": 0.3578656853725851,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00245783943682909,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "num_tokens": 341195599.0,
+      "reward": 0.4609375,
+      "reward_std": 0.21594557166099548,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999957263469696,
+      "sampling/importance_sampling_ratio/min": 1.526316918898374e-05,
+      "sampling/sampling_logp_difference/max": 11.090067863464355,
+      "sampling/sampling_logp_difference/mean": 0.019989900290966034,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 5.272259386401856e-06,
+      "clip_ratio/high_mean": 1.318064846600464e-06,
+      "clip_ratio/low_mean": 2.2939096254503966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4257160987417592e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15788.0,
+      "completions/mean_length": 6093.296875,
+      "completions/mean_terminated_length": 5929.95263671875,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.9640207663178444,
+      "epoch": 0.35878564857405704,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0067657483741641045,
+      "learning_rate": 1e-05,
+      "loss": 0.0181,
+      "num_tokens": 341993565.0,
+      "reward": 0.4453125,
+      "reward_std": 0.12415502220392227,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998992681503296,
+      "sampling/importance_sampling_ratio/min": 0.010459281504154205,
+      "sampling/sampling_logp_difference/max": 4.56026554107666,
+      "sampling/sampling_logp_difference/mean": 0.02037961222231388,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 4.566248594528588e-05,
+      "clip_ratio/low_min": 4.402028480399167e-06,
+      "clip_ratio/region_mean": 4.566248594528588e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16170.0,
+      "completions/max_terminated_length": 16170.0,
+      "completions/mean_length": 7620.09375,
+      "completions/mean_terminated_length": 7620.09375,
+      "completions/min_length": 1076.0,
+      "completions/min_terminated_length": 1076.0,
+      "entropy": 0.9773544892668724,
+      "epoch": 0.35970561177552896,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018817185191437602,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 342990545.0,
+      "reward": 0.3046875,
+      "reward_std": 0.18755048513412476,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999568462371826,
+      "sampling/importance_sampling_ratio/min": 0.0006883936002850533,
+      "sampling/sampling_logp_difference/max": 7.281149864196777,
+      "sampling/sampling_logp_difference/mean": 0.021528441458940506,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 2.6727505428425502e-05,
+      "clip_ratio/high_mean": 7.985045499481203e-06,
+      "clip_ratio/low_mean": 7.762144696243922e-05,
+      "clip_ratio/low_min": 2.4772080450929934e-05,
+      "clip_ratio/region_mean": 8.560649303035461e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15053.0,
+      "completions/mean_length": 6963.984375,
+      "completions/mean_terminated_length": 6737.904296875,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 0.9683744385838509,
+      "epoch": 0.36062557497700093,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0052104732021689415,
+      "learning_rate": 1e-05,
+      "loss": 0.087,
+      "num_tokens": 343898791.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3621976971626282,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999324679374695,
+      "sampling/importance_sampling_ratio/min": 0.010815954767167568,
+      "sampling/sampling_logp_difference/max": 4.526732921600342,
+      "sampling/sampling_logp_difference/mean": 0.021434593945741653,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 1.3545108686230378e-05,
+      "clip_ratio/high_mean": 4.365133804640209e-06,
+      "clip_ratio/low_mean": 2.5377692509209737e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9742826200163108e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15116.0,
+      "completions/mean_length": 6718.5078125,
+      "completions/mean_terminated_length": 6642.4013671875,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.9043834507465363,
+      "epoch": 0.36154553817847285,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005151392426341772,
+      "learning_rate": 1e-05,
+      "loss": 0.0085,
+      "num_tokens": 344779672.0,
+      "reward": 0.4921875,
+      "reward_std": 0.251188188791275,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999840497970581,
+      "sampling/importance_sampling_ratio/min": 0.0024171893019229174,
+      "sampling/sampling_logp_difference/max": 6.025149822235107,
+      "sampling/sampling_logp_difference/mean": 0.0201373603194952,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 1.2263486723895767e-05,
+      "clip_ratio/high_mean": 3.927679188109323e-06,
+      "clip_ratio/low_mean": 2.739263118201052e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.132031042696326e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16342.0,
+      "completions/mean_length": 7044.640625,
+      "completions/mean_terminated_length": 6820.49609375,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "entropy": 0.9017335474491119,
+      "epoch": 0.3624655013799448,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026606651954352856,
+      "learning_rate": 1e-05,
+      "loss": 0.0554,
+      "num_tokens": 345701722.0,
+      "reward": 0.3125,
+      "reward_std": 0.24146249890327454,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05,
+      "sampling/sampling_logp_difference/max": 10.157968521118164,
+      "sampling/sampling_logp_difference/mean": 0.01981864869594574,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 1.026556356009678e-05,
+      "clip_ratio/high_mean": 2.566390890024195e-06,
+      "clip_ratio/low_mean": 4.819571529424138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.0762106297952414e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15476.0,
+      "completions/mean_length": 6031.875,
+      "completions/mean_terminated_length": 5950.3623046875,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.8537683561444283,
+      "epoch": 0.36338546458141674,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003957017324864864,
+      "learning_rate": 1e-05,
+      "loss": 0.0947,
+      "num_tokens": 346492810.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2858940362930298,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999707341194153,
+      "sampling/importance_sampling_ratio/min": 0.0015133036067709327,
+      "sampling/sampling_logp_difference/max": 6.493460178375244,
+      "sampling/sampling_logp_difference/mean": 0.018711457028985023,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 5.870488848813693e-06,
+      "clip_ratio/high_mean": 1.4676222122034233e-06,
+      "clip_ratio/low_mean": 3.637038832948747e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.783801014378696e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15942.0,
+      "completions/mean_length": 7429.3515625,
+      "completions/mean_terminated_length": 6911.31396484375,
+      "completions/min_length": 1194.0,
+      "completions/min_terminated_length": 1194.0,
+      "entropy": 0.8821266070008278,
+      "epoch": 0.36430542778288866,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002122648525983095,
+      "learning_rate": 1e-05,
+      "loss": 0.1257,
+      "num_tokens": 347462871.0,
+      "reward": 0.453125,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000076293945312,
+      "sampling/importance_sampling_ratio/min": 0.00014005196862854064,
+      "sampling/sampling_logp_difference/max": 8.873497009277344,
+      "sampling/sampling_logp_difference/mean": 0.01998838409781456,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 1.0663932243915042e-05,
+      "clip_ratio/high_mean": 2.6659830609787605e-06,
+      "clip_ratio/low_mean": 6.443337406381033e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.709935701110226e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15761.0,
+      "completions/mean_length": 7131.7109375,
+      "completions/mean_terminated_length": 6833.25,
+      "completions/min_length": 821.0,
+      "completions/min_terminated_length": 821.0,
+      "entropy": 0.8575824722647667,
+      "epoch": 0.36522539098436063,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002546454081311822,
+      "learning_rate": 1e-05,
+      "loss": 0.0676,
+      "num_tokens": 348395842.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2869499623775482,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999964714050293,
+      "sampling/importance_sampling_ratio/min": 0.0002167800412280485,
+      "sampling/sampling_logp_difference/max": 8.436627388000488,
+      "sampling/sampling_logp_difference/mean": 0.0193922221660614,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 3.847337666229578e-06,
+      "clip_ratio/high_mean": 9.618344165573944e-07,
+      "clip_ratio/low_mean": 3.932982110654848e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.029165563679271e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16200.0,
+      "completions/mean_length": 6858.34375,
+      "completions/mean_terminated_length": 6707.14306640625,
+      "completions/min_length": 772.0,
+      "completions/min_terminated_length": 772.0,
+      "entropy": 0.9539813920855522,
+      "epoch": 0.36614535418583255,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00492837093770504,
+      "learning_rate": 1e-05,
+      "loss": 0.0818,
+      "num_tokens": 349292790.0,
+      "reward": 0.390625,
+      "reward_std": 0.1949220597743988,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998850226402283,
+      "sampling/importance_sampling_ratio/min": 0.0011153683299198747,
+      "sampling/sampling_logp_difference/max": 6.79857063293457,
+      "sampling/sampling_logp_difference/mean": 0.020318543538451195,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 1.291372609557584e-05,
+      "clip_ratio/high_mean": 3.22843152389396e-06,
+      "clip_ratio/low_mean": 3.8245348378040944e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1473780811429606e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15261.0,
+      "completions/mean_length": 7809.984375,
+      "completions/mean_terminated_length": 7533.40283203125,
+      "completions/min_length": 1002.0,
+      "completions/min_terminated_length": 1002.0,
+      "entropy": 0.8353303670883179,
+      "epoch": 0.3670653173873045,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004895905964076519,
+      "learning_rate": 1e-05,
+      "loss": 0.0273,
+      "num_tokens": 350312556.0,
+      "reward": 0.3203125,
+      "reward_std": 0.22567616403102875,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999260306358337,
+      "sampling/importance_sampling_ratio/min": 0.0008417933131568134,
+      "sampling/sampling_logp_difference/max": 7.0799760818481445,
+      "sampling/sampling_logp_difference/mean": 0.018754083663225174,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 1.1250081115576904e-05,
+      "clip_ratio/high_mean": 3.5690324011738994e-06,
+      "clip_ratio/low_mean": 3.196108968950284e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.553012152224255e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15057.0,
+      "completions/mean_length": 7194.9296875,
+      "completions/mean_terminated_length": 6821.39013671875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.9744522422552109,
+      "epoch": 0.36798528058877644,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0032397822942584753,
+      "learning_rate": 1e-05,
+      "loss": 0.0402,
+      "num_tokens": 351252755.0,
+      "reward": 0.421875,
+      "reward_std": 0.19438527524471283,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998766183853149,
+      "sampling/importance_sampling_ratio/min": 0.00023159870761446655,
+      "sampling/sampling_logp_difference/max": 8.370504379272461,
+      "sampling/sampling_logp_difference/mean": 0.02105094864964485,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 6.980455509619787e-06,
+      "clip_ratio/high_mean": 1.7451138774049468e-06,
+      "clip_ratio/low_mean": 2.2670621888210007e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.441573599298863e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15745.0,
+      "completions/mean_length": 6836.234375,
+      "completions/mean_terminated_length": 6607.08837890625,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.9149863049387932,
+      "epoch": 0.3689052437902484,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031576494220644236,
+      "learning_rate": 1e-05,
+      "loss": 0.0424,
+      "num_tokens": 352145873.0,
+      "reward": 0.3671875,
+      "reward_std": 0.22225630283355713,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999266862869263,
+      "sampling/importance_sampling_ratio/min": 0.0011975533561781049,
+      "sampling/sampling_logp_difference/max": 6.727474689483643,
+      "sampling/sampling_logp_difference/mean": 0.020445333793759346,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 2.3557336589874467e-05,
+      "clip_ratio/high_mean": 5.889334147468617e-06,
+      "clip_ratio/low_mean": 5.359988131203863e-05,
+      "clip_ratio/low_min": 1.3856095392839052e-05,
+      "clip_ratio/region_mean": 5.9489215118446737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16380.0,
+      "completions/mean_length": 6942.65625,
+      "completions/mean_terminated_length": 6638.0966796875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "entropy": 0.7541583999991417,
+      "epoch": 0.36982520699172033,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.003970830701291561,
+      "learning_rate": 1e-05,
+      "loss": 0.051,
+      "num_tokens": 353056405.0,
+      "reward": 0.453125,
+      "reward_std": 0.3282659649848938,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000462532043457,
+      "sampling/importance_sampling_ratio/min": 8.399576472584158e-06,
+      "sampling/sampling_logp_difference/max": 11.687329292297363,
+      "sampling/sampling_logp_difference/mean": 0.018101349472999573,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 2.6139805413549766e-05,
+      "clip_ratio/high_mean": 7.517377525800839e-06,
+      "clip_ratio/low_mean": 1.968103515537223e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7198412681173068e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14786.0,
+      "completions/max_terminated_length": 14786.0,
+      "completions/mean_length": 6022.1875,
+      "completions/mean_terminated_length": 6022.1875,
+      "completions/min_length": 1285.0,
+      "completions/min_terminated_length": 1285.0,
+      "entropy": 0.9535745903849602,
+      "epoch": 0.37074517019319225,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0043656788766384125,
+      "learning_rate": 1e-05,
+      "loss": 0.029,
+      "num_tokens": 353844661.0,
+      "reward": 0.4140625,
+      "reward_std": 0.22225631773471832,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999772310256958,
+      "sampling/importance_sampling_ratio/min": 0.04981832951307297,
+      "sampling/sampling_logp_difference/max": 2.9993722438812256,
+      "sampling/sampling_logp_difference/mean": 0.020655371248722076,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 9.152076700047473e-06,
+      "clip_ratio/high_mean": 2.9508817647183605e-06,
+      "clip_ratio/low_mean": 5.21388310517068e-05,
+      "clip_ratio/low_min": 2.633131089169183e-06,
+      "clip_ratio/region_mean": 5.508971298695542e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15906.0,
+      "completions/mean_length": 8068.96875,
+      "completions/mean_terminated_length": 7869.408203125,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.9473539590835571,
+      "epoch": 0.3716651333946642,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006543307099491358,
+      "learning_rate": 1e-05,
+      "loss": 0.006,
+      "num_tokens": 354894689.0,
+      "reward": 0.2578125,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999514818191528,
+      "sampling/importance_sampling_ratio/min": 6.672408926533535e-05,
+      "sampling/sampling_logp_difference/max": 9.614944458007812,
+      "sampling/sampling_logp_difference/mean": 0.021852033212780952,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 2.9619268843816826e-05,
+      "clip_ratio/high_mean": 7.4048172109542065e-06,
+      "clip_ratio/low_mean": 5.5152235972855124e-05,
+      "clip_ratio/low_min": 1.0455875781190116e-05,
+      "clip_ratio/region_mean": 6.255705375224352e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15748.0,
+      "completions/mean_length": 5960.1875,
+      "completions/mean_terminated_length": 5878.1103515625,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "entropy": 0.9564141109585762,
+      "epoch": 0.37258509659613614,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003351036459207535,
+      "learning_rate": 1e-05,
+      "loss": 0.0293,
+      "num_tokens": 355677273.0,
+      "reward": 0.46875,
+      "reward_std": 0.31642353534698486,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999220371246338,
+      "sampling/importance_sampling_ratio/min": 0.0012859756825491786,
+      "sampling/sampling_logp_difference/max": 6.656237602233887,
+      "sampling/sampling_logp_difference/mean": 0.021779976785182953,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 7.957685966175632e-06,
+      "clip_ratio/high_mean": 1.989421491543908e-06,
+      "clip_ratio/low_mean": 3.758041248147492e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.956983414354909e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15669.0,
+      "completions/mean_length": 7620.21875,
+      "completions/mean_terminated_length": 7189.212890625,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 1.035948596894741,
+      "epoch": 0.3735050597976081,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031219006050378084,
+      "learning_rate": 1e-05,
+      "loss": 0.039,
+      "num_tokens": 356675829.0,
+      "reward": 0.296875,
+      "reward_std": 0.1751839816570282,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0001060962677002,
+      "sampling/importance_sampling_ratio/min": 0.010141897015273571,
+      "sampling/sampling_logp_difference/max": 4.591080188751221,
+      "sampling/sampling_logp_difference/mean": 0.021951109170913696,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 2.286768199155631e-05,
+      "clip_ratio/high_mean": 5.7169204978890775e-06,
+      "clip_ratio/low_mean": 3.914574369900947e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.486266482217616e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14038.0,
+      "completions/mean_length": 5806.0234375,
+      "completions/mean_terminated_length": 5638.119140625,
+      "completions/min_length": 1319.0,
+      "completions/min_terminated_length": 1319.0,
+      "entropy": 0.8977029845118523,
+      "epoch": 0.37442502299908004,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002810312667861581,
+      "learning_rate": 1e-05,
+      "loss": 0.0471,
+      "num_tokens": 357438712.0,
+      "reward": 0.546875,
+      "reward_std": 0.22832970321178436,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999280571937561,
+      "sampling/importance_sampling_ratio/min": 0.0011738575994968414,
+      "sampling/sampling_logp_difference/max": 6.747459888458252,
+      "sampling/sampling_logp_difference/mean": 0.01965375244617462,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 1.2219379641464911e-05,
+      "clip_ratio/high_mean": 3.054844910366228e-06,
+      "clip_ratio/low_mean": 3.186109779562685e-05,
+      "clip_ratio/low_min": 4.3511558942554984e-06,
+      "clip_ratio/region_mean": 3.4915943160740426e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15705.0,
+      "completions/max_terminated_length": 15705.0,
+      "completions/mean_length": 6537.4609375,
+      "completions/mean_terminated_length": 6537.4609375,
+      "completions/min_length": 842.0,
+      "completions/min_terminated_length": 842.0,
+      "entropy": 0.9577726796269417,
+      "epoch": 0.37534498620055196,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004516562446951866,
+      "learning_rate": 1e-05,
+      "loss": 0.0517,
+      "num_tokens": 358296731.0,
+      "reward": 0.3828125,
+      "reward_std": 0.1830746978521347,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999170303344727,
+      "sampling/importance_sampling_ratio/min": 2.384942035860149e-06,
+      "sampling/sampling_logp_difference/max": 12.946335792541504,
+      "sampling/sampling_logp_difference/mean": 0.021242395043373108,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 1.4422689218918094e-05,
+      "clip_ratio/high_mean": 3.6056723047295236e-06,
+      "clip_ratio/low_mean": 3.026239573955536e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3868068385345396e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16360.0,
+      "completions/mean_length": 7896.671875,
+      "completions/mean_terminated_length": 7622.88671875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.9163230583071709,
+      "epoch": 0.37626494940202393,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003542230697348714,
+      "learning_rate": 1e-05,
+      "loss": 0.05,
+      "num_tokens": 359327001.0,
+      "reward": 0.375,
+      "reward_std": 0.23645778000354767,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998560547828674,
+      "sampling/importance_sampling_ratio/min": 0.00010891625424847007,
+      "sampling/sampling_logp_difference/max": 9.124931335449219,
+      "sampling/sampling_logp_difference/mean": 0.020085681229829788,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 1.7827243254942005e-05,
+      "clip_ratio/high_mean": 5.474494003010477e-06,
+      "clip_ratio/low_mean": 4.2465159026505717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.793965263161226e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15297.0,
+      "completions/mean_length": 6728.7109375,
+      "completions/mean_terminated_length": 6652.68505859375,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.9010183215141296,
+      "epoch": 0.37718491260349585,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0035069347359240055,
+      "learning_rate": 1e-05,
+      "loss": 0.0518,
+      "num_tokens": 360208780.0,
+      "reward": 0.5390625,
+      "reward_std": 0.29932135343551636,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999571442604065,
+      "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05,
+      "sampling/sampling_logp_difference/max": 11.124998092651367,
+      "sampling/sampling_logp_difference/mean": 0.021022530272603035,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 1.0376989393989788e-05,
+      "clip_ratio/high_mean": 2.594247348497447e-06,
+      "clip_ratio/low_mean": 2.8587513156708155e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1181759936771414e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16167.0,
+      "completions/mean_length": 6800.3984375,
+      "completions/mean_terminated_length": 6491.25,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.8654960840940475,
+      "epoch": 0.3781048758049678,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033910400234162807,
+      "learning_rate": 1e-05,
+      "loss": 0.0221,
+      "num_tokens": 361098567.0,
+      "reward": 0.5625,
+      "reward_std": 0.2306838035583496,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998576641082764,
+      "sampling/importance_sampling_ratio/min": 0.001449413481168449,
+      "sampling/sampling_logp_difference/max": 6.536596298217773,
+      "sampling/sampling_logp_difference/mean": 0.019660964608192444,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 2.3068858354236e-05,
+      "clip_ratio/high_mean": 7.792090059410839e-06,
+      "clip_ratio/low_mean": 5.8515578757578623e-05,
+      "clip_ratio/low_min": 1.0348648629587842e-05,
+      "clip_ratio/region_mean": 6.630766870330262e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16373.0,
+      "completions/mean_length": 7103.4453125,
+      "completions/mean_terminated_length": 6956.13525390625,
+      "completions/min_length": 1711.0,
+      "completions/min_terminated_length": 1711.0,
+      "entropy": 0.8317076042294502,
+      "epoch": 0.37902483900643974,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0036110079381614923,
+      "learning_rate": 1e-05,
+      "loss": 0.0834,
+      "num_tokens": 362027520.0,
+      "reward": 0.546875,
+      "reward_std": 0.33797892928123474,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999338984489441,
+      "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05,
+      "sampling/sampling_logp_difference/max": 11.458046913146973,
+      "sampling/sampling_logp_difference/mean": 0.01939362846314907,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 3.112394779236638e-06,
+      "clip_ratio/high_mean": 7.780986948091595e-07,
+      "clip_ratio/low_mean": 5.127149995587388e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.204959859383962e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15830.0,
+      "completions/mean_length": 7344.9296875,
+      "completions/mean_terminated_length": 6900.384765625,
+      "completions/min_length": 1368.0,
+      "completions/min_terminated_length": 1368.0,
+      "entropy": 0.8387318029999733,
+      "epoch": 0.37994480220791166,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002141098491847515,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 362985207.0,
+      "reward": 0.34375,
+      "reward_std": 0.28930896520614624,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999322891235352,
+      "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05,
+      "sampling/sampling_logp_difference/max": 10.874617576599121,
+      "sampling/sampling_logp_difference/mean": 0.01929464004933834,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 5.2602786126954015e-06,
+      "clip_ratio/high_mean": 1.3150696531738504e-06,
+      "clip_ratio/low_mean": 1.7854434247510653e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.9169503786997666e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16137.0,
+      "completions/mean_length": 6377.7734375,
+      "completions/mean_terminated_length": 6218.94482421875,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "entropy": 0.9732858911156654,
+      "epoch": 0.38086476540938363,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015244127716869116,
+      "learning_rate": 1e-05,
+      "loss": 0.0608,
+      "num_tokens": 363823914.0,
+      "reward": 0.4375,
+      "reward_std": 0.1988610327243805,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999363422393799,
+      "sampling/importance_sampling_ratio/min": 0.006335465237498283,
+      "sampling/sampling_logp_difference/max": 5.061592102050781,
+      "sampling/sampling_logp_difference/mean": 0.020688029006123543,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 2.6195500595349586e-05,
+      "clip_ratio/high_mean": 6.548875148837396e-06,
+      "clip_ratio/low_mean": 3.3802934012783226e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035180882056011e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14456.0,
+      "completions/mean_length": 5599.7890625,
+      "completions/mean_terminated_length": 5340.96826171875,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.8872368410229683,
+      "epoch": 0.38178472861085555,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002647512126713991,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 364561127.0,
+      "reward": 0.453125,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999077916145325,
+      "sampling/importance_sampling_ratio/min": 2.370526999584399e-06,
+      "sampling/sampling_logp_difference/max": 12.952398300170898,
+      "sampling/sampling_logp_difference/mean": 0.01878243312239647,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 2.157278959202813e-05,
+      "clip_ratio/high_mean": 5.3931973980070325e-06,
+      "clip_ratio/low_mean": 7.215861739950924e-05,
+      "clip_ratio/low_min": 1.4898997051204788e-05,
+      "clip_ratio/region_mean": 7.755181559332414e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15905.0,
+      "completions/mean_length": 7877.2890625,
+      "completions/mean_terminated_length": 7385.1650390625,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.8416353687644005,
+      "epoch": 0.3827046918123275,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0018051012884825468,
+      "learning_rate": 1e-05,
+      "loss": 0.0541,
+      "num_tokens": 365590124.0,
+      "reward": 0.3125,
+      "reward_std": 0.28407180309295654,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999774694442749,
+      "sampling/importance_sampling_ratio/min": 0.0004095165350008756,
+      "sampling/sampling_logp_difference/max": 7.800533294677734,
+      "sampling/sampling_logp_difference/mean": 0.019809434190392494,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 2.540994637456606e-05,
+      "clip_ratio/high_mean": 6.352486593641515e-06,
+      "clip_ratio/low_mean": 4.230594890941575e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8658435844117776e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16083.0,
+      "completions/mean_length": 6836.7890625,
+      "completions/mean_terminated_length": 6200.30859375,
+      "completions/min_length": 909.0,
+      "completions/min_terminated_length": 909.0,
+      "entropy": 0.8647575601935387,
+      "epoch": 0.38362465501379944,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004550795070827007,
+      "learning_rate": 1e-05,
+      "loss": 0.0146,
+      "num_tokens": 366486337.0,
+      "reward": 0.40625,
+      "reward_std": 0.22620806097984314,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999873638153076,
+      "sampling/importance_sampling_ratio/min": 0.0001089095021598041,
+      "sampling/sampling_logp_difference/max": 9.124993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01992485672235489,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 1.1592664577619871e-05,
+      "clip_ratio/high_mean": 2.8981661444049678e-06,
+      "clip_ratio/low_mean": 3.5717548257707676e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.861571451579948e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16286.0,
+      "completions/mean_length": 6884.953125,
+      "completions/mean_terminated_length": 6417.78662109375,
+      "completions/min_length": 1289.0,
+      "completions/min_terminated_length": 1289.0,
+      "entropy": 0.8691708743572235,
+      "epoch": 0.3845446182152714,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005958946421742439,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 367386163.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000011920928955,
+      "sampling/importance_sampling_ratio/min": 9.519772902422119e-06,
+      "sampling/sampling_logp_difference/max": 11.562139511108398,
+      "sampling/sampling_logp_difference/mean": 0.019436441361904144,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 2.7658640192385064e-05,
+      "clip_ratio/high_mean": 8.455849524580117e-06,
+      "clip_ratio/low_mean": 3.938097847822064e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7836828116487595e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15574.0,
+      "completions/mean_length": 7439.1328125,
+      "completions/mean_terminated_length": 7150.58837890625,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 0.795464999973774,
+      "epoch": 0.38546458141674333,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00558120384812355,
+      "learning_rate": 1e-05,
+      "loss": 0.1918,
+      "num_tokens": 368357500.0,
+      "reward": 0.609375,
+      "reward_std": 0.3795146346092224,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999570250511169,
+      "sampling/importance_sampling_ratio/min": 0.0001159337698481977,
+      "sampling/sampling_logp_difference/max": 9.062491416931152,
+      "sampling/sampling_logp_difference/mean": 0.018824251368641853,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 8.509555527780321e-06,
+      "clip_ratio/high_mean": 2.1273888819450804e-06,
+      "clip_ratio/low_mean": 3.0958593640662e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.308598269313734e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16236.0,
+      "completions/mean_length": 6751.53125,
+      "completions/mean_terminated_length": 6520.3525390625,
+      "completions/min_length": 715.0,
+      "completions/min_terminated_length": 715.0,
+      "entropy": 0.9450879693031311,
+      "epoch": 0.38638454461821525,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004628168884664774,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "num_tokens": 369242920.0,
+      "reward": 0.359375,
+      "reward_std": 0.20859163999557495,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999655485153198,
+      "sampling/importance_sampling_ratio/min": 0.0006074689445085824,
+      "sampling/sampling_logp_difference/max": 7.406209468841553,
+      "sampling/sampling_logp_difference/mean": 0.019376013427972794,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 1.8288420505996328e-05,
+      "clip_ratio/high_mean": 4.572105126499082e-06,
+      "clip_ratio/low_mean": 4.86290555272717e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.320115997164976e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16164.0,
+      "completions/mean_length": 7023.296875,
+      "completions/mean_terminated_length": 6315.3447265625,
+      "completions/min_length": 1628.0,
+      "completions/min_terminated_length": 1628.0,
+      "entropy": 0.7378111630678177,
+      "epoch": 0.3873045078196872,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.00389425759203732,
+      "learning_rate": 1e-05,
+      "loss": 0.1066,
+      "num_tokens": 370159510.0,
+      "reward": 0.4921875,
+      "reward_std": 0.323777437210083,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999127388000488,
+      "sampling/importance_sampling_ratio/min": 0.00014012664905749261,
+      "sampling/sampling_logp_difference/max": 8.872963905334473,
+      "sampling/sampling_logp_difference/mean": 0.016914553940296173,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 2.1269573153404053e-05,
+      "clip_ratio/high_mean": 5.948400371380558e-06,
+      "clip_ratio/low_mean": 2.3538930747690756e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9487331687505502e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16018.0,
+      "completions/max_terminated_length": 16018.0,
+      "completions/mean_length": 7702.3046875,
+      "completions/mean_terminated_length": 7702.3046875,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "entropy": 0.9053447172045708,
+      "epoch": 0.38822447102115915,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004324545152485371,
+      "learning_rate": 1e-05,
+      "loss": 0.0149,
+      "num_tokens": 371162773.0,
+      "reward": 0.2421875,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.00001060962677,
+      "sampling/importance_sampling_ratio/min": 2.283278627146501e-05,
+      "sampling/sampling_logp_difference/max": 10.687313079833984,
+      "sampling/sampling_logp_difference/mean": 0.020495830103754997,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 1.0294916819475475e-05,
+      "clip_ratio/high_mean": 2.5737292048688687e-06,
+      "clip_ratio/low_mean": 5.831611520079605e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.088984559937671e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15295.0,
+      "completions/mean_length": 6904.78125,
+      "completions/mean_terminated_length": 6754.31787109375,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.7991176024079323,
+      "epoch": 0.3891444342226311,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003239463549107313,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "num_tokens": 372067241.0,
+      "reward": 0.328125,
+      "reward_std": 0.32719242572784424,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 0.00012340991816017777,
+      "sampling/sampling_logp_difference/max": 8.999999046325684,
+      "sampling/sampling_logp_difference/mean": 0.019042208790779114,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 2.7261318791715894e-05,
+      "clip_ratio/high_mean": 7.926559305815317e-06,
+      "clip_ratio/low_mean": 1.552133551285806e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.3447895273420727e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15399.0,
+      "completions/mean_length": 6107.7421875,
+      "completions/mean_terminated_length": 5602.35205078125,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "entropy": 0.9495253190398216,
+      "epoch": 0.39006439742410304,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015464330790564418,
+      "learning_rate": 1e-05,
+      "loss": 0.0587,
+      "num_tokens": 372866072.0,
+      "reward": 0.421875,
+      "reward_std": 0.1820138692855835,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999971330165863,
+      "sampling/importance_sampling_ratio/min": 0.00024684349773451686,
+      "sampling/sampling_logp_difference/max": 8.306756019592285,
+      "sampling/sampling_logp_difference/mean": 0.019793221727013588,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 2.457227401464479e-05,
+      "clip_ratio/high_mean": 8.533324717063806e-06,
+      "clip_ratio/low_mean": 3.261690835643094e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.115023284612107e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15939.0,
+      "completions/mean_length": 6079.8046875,
+      "completions/mean_terminated_length": 5747.4111328125,
+      "completions/min_length": 1082.0,
+      "completions/min_terminated_length": 1082.0,
+      "entropy": 0.8005363270640373,
+      "epoch": 0.39098436062557496,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024811832699924707,
+      "learning_rate": 1e-05,
+      "loss": 0.1124,
+      "num_tokens": 373663463.0,
+      "reward": 0.625,
+      "reward_std": 0.2630355656147003,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999743103981018,
+      "sampling/importance_sampling_ratio/min": 0.00019348970090504736,
+      "sampling/sampling_logp_difference/max": 8.550286293029785,
+      "sampling/sampling_logp_difference/mean": 0.017151469364762306,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 3.3719989005476236e-06,
+      "clip_ratio/high_mean": 8.429997251369059e-07,
+      "clip_ratio/low_mean": 2.132218082806503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2165180553201935e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14925.0,
+      "completions/mean_length": 6453.7890625,
+      "completions/mean_terminated_length": 6375.5986328125,
+      "completions/min_length": 347.0,
+      "completions/min_terminated_length": 347.0,
+      "entropy": 0.9212624430656433,
+      "epoch": 0.39190432382704693,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031475063879042864,
+      "learning_rate": 1e-05,
+      "loss": 0.0959,
+      "num_tokens": 374517492.0,
+      "reward": 0.34375,
+      "reward_std": 0.19910329580307007,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999594688415527,
+      "sampling/importance_sampling_ratio/min": 0.015664709731936455,
+      "sampling/sampling_logp_difference/max": 4.156344890594482,
+      "sampling/sampling_logp_difference/mean": 0.019899867475032806,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 1.907509408738406e-05,
+      "clip_ratio/high_mean": 5.984868664654641e-06,
+      "clip_ratio/low_mean": 3.784128080042137e-05,
+      "clip_ratio/low_min": 3.7751804029539926e-06,
+      "clip_ratio/region_mean": 4.382614952191943e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16159.0,
+      "completions/max_terminated_length": 16159.0,
+      "completions/mean_length": 6126.9921875,
+      "completions/mean_terminated_length": 6126.9921875,
+      "completions/min_length": 1106.0,
+      "completions/min_terminated_length": 1106.0,
+      "entropy": 0.8252849578857422,
+      "epoch": 0.39282428702851885,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004200868774205446,
+      "learning_rate": 1e-05,
+      "loss": 0.0276,
+      "num_tokens": 375320339.0,
+      "reward": 0.4140625,
+      "reward_std": 0.1830747127532959,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999815225601196,
+      "sampling/importance_sampling_ratio/min": 0.005763276945799589,
+      "sampling/sampling_logp_difference/max": 5.156249046325684,
+      "sampling/sampling_logp_difference/mean": 0.01833093911409378,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 1.8918785372079583e-05,
+      "clip_ratio/high_mean": 5.476571459439583e-06,
+      "clip_ratio/low_mean": 6.169724406390742e-05,
+      "clip_ratio/low_min": 7.494657666029525e-06,
+      "clip_ratio/region_mean": 6.717381506859965e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15411.0,
+      "completions/mean_length": 6739.09375,
+      "completions/mean_terminated_length": 6427.9677734375,
+      "completions/min_length": 1228.0,
+      "completions/min_terminated_length": 1228.0,
+      "entropy": 0.8008574098348618,
+      "epoch": 0.3937442502299908,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003204014617949724,
+      "learning_rate": 1e-05,
+      "loss": 0.0481,
+      "num_tokens": 376201015.0,
+      "reward": 0.5390625,
+      "reward_std": 0.37086254358291626,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998303651809692,
+      "sampling/importance_sampling_ratio/min": 0.00010144581028725952,
+      "sampling/sampling_logp_difference/max": 9.195985794067383,
+      "sampling/sampling_logp_difference/mean": 0.018961725756525993,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 1.3558789078160771e-05,
+      "clip_ratio/high_mean": 3.389697269540193e-06,
+      "clip_ratio/low_mean": 5.3925050679026754e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.731474743697618e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15634.0,
+      "completions/mean_length": 7245.8984375,
+      "completions/mean_terminated_length": 6951.12060546875,
+      "completions/min_length": 1306.0,
+      "completions/min_terminated_length": 1306.0,
+      "entropy": 1.0351596996188164,
+      "epoch": 0.39466421343146274,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0039763906970620155,
+      "learning_rate": 1e-05,
+      "loss": 0.0299,
+      "num_tokens": 377149650.0,
+      "reward": 0.375,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000600814819336,
+      "sampling/importance_sampling_ratio/min": 8.106228051474318e-05,
+      "sampling/sampling_logp_difference/max": 9.420292854309082,
+      "sampling/sampling_logp_difference/mean": 0.020948028191924095,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 1.4580486549675697e-05,
+      "clip_ratio/high_mean": 4.259903903403028e-06,
+      "clip_ratio/low_mean": 4.6149686397711775e-05,
+      "clip_ratio/low_min": 3.006686938533676e-06,
+      "clip_ratio/region_mean": 5.04095905853319e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15608.0,
+      "completions/mean_length": 6958.625,
+      "completions/mean_terminated_length": 6495.08154296875,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.8360240310430527,
+      "epoch": 0.39558417663293466,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0031417158897966146,
+      "learning_rate": 1e-05,
+      "loss": 0.0195,
+      "num_tokens": 378057802.0,
+      "reward": 0.515625,
+      "reward_std": 0.35771697759628296,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999384880065918,
+      "sampling/importance_sampling_ratio/min": 0.00010235882655251771,
+      "sampling/sampling_logp_difference/max": 9.187026023864746,
+      "sampling/sampling_logp_difference/mean": 0.019185224547982216,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 6.681633749394678e-06,
+      "clip_ratio/high_mean": 1.6704084373486694e-06,
+      "clip_ratio/low_mean": 5.096616632727091e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.263657521936693e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15410.0,
+      "completions/max_terminated_length": 15410.0,
+      "completions/mean_length": 5696.3984375,
+      "completions/mean_terminated_length": 5696.3984375,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.7887749597430229,
+      "epoch": 0.39650413983440663,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004943124484270811,
+      "learning_rate": 1e-05,
+      "loss": 0.096,
+      "num_tokens": 378808021.0,
+      "reward": 0.515625,
+      "reward_std": 0.31246691942214966,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999057054519653,
+      "sampling/importance_sampling_ratio/min": 0.0015042300801724195,
+      "sampling/sampling_logp_difference/max": 6.499474048614502,
+      "sampling/sampling_logp_difference/mean": 0.018845941871404648,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 1.7526824194646906e-05,
+      "clip_ratio/high_mean": 5.417880970526312e-06,
+      "clip_ratio/low_mean": 3.513921649300755e-05,
+      "clip_ratio/low_min": 6.075038982089609e-06,
+      "clip_ratio/region_mean": 4.0557096895099676e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14233.0,
+      "completions/mean_length": 6480.8828125,
+      "completions/mean_terminated_length": 6323.69091796875,
+      "completions/min_length": 1013.0,
+      "completions/min_terminated_length": 1013.0,
+      "entropy": 0.8796411231160164,
+      "epoch": 0.39742410303587855,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00595651101320982,
+      "learning_rate": 1e-05,
+      "loss": 0.0546,
+      "num_tokens": 379659710.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998855590820312,
+      "sampling/importance_sampling_ratio/min": 0.0017907419241964817,
+      "sampling/sampling_logp_difference/max": 6.325125217437744,
+      "sampling/sampling_logp_difference/mean": 0.01906527951359749,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.4512424602107785e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.4512424602107785e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16204.0,
+      "completions/mean_length": 7501.703125,
+      "completions/mean_terminated_length": 6829.93310546875,
+      "completions/min_length": 680.0,
+      "completions/min_terminated_length": 680.0,
+      "entropy": 0.786028303205967,
+      "epoch": 0.3983440662373505,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.0024527597706764936,
+      "learning_rate": 1e-05,
+      "loss": 0.0683,
+      "num_tokens": 380640720.0,
+      "reward": 0.5234375,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999595880508423,
+      "sampling/importance_sampling_ratio/min": 8.851602615322918e-07,
+      "sampling/sampling_logp_difference/max": 13.93749713897705,
+      "sampling/sampling_logp_difference/mean": 0.01873261108994484,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 1.4606259583160863e-05,
+      "clip_ratio/high_mean": 5.505394312876888e-06,
+      "clip_ratio/low_mean": 3.1679782978244475e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7185177234277944e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15185.0,
+      "completions/mean_length": 5619.2890625,
+      "completions/mean_terminated_length": 5448.4208984375,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.8098893761634827,
+      "epoch": 0.39926402943882244,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004280989523977041,
+      "learning_rate": 1e-05,
+      "loss": 0.0514,
+      "num_tokens": 381377981.0,
+      "reward": 0.609375,
+      "reward_std": 0.2398776412010193,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999443292617798,
+      "sampling/importance_sampling_ratio/min": 0.0010248658945783973,
+      "sampling/sampling_logp_difference/max": 6.883193492889404,
+      "sampling/sampling_logp_difference/mean": 0.017923470586538315,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 1.4808703554081148e-05,
+      "clip_ratio/high_mean": 3.702175888520287e-06,
+      "clip_ratio/low_mean": 2.3637440563106793e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.7339616224253405e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5243.8203125,
+      "completions/mean_terminated_length": 5156.1025390625,
+      "completions/min_length": 576.0,
+      "completions/min_terminated_length": 576.0,
+      "entropy": 0.7485036551952362,
+      "epoch": 0.40018399264029436,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004721642471849918,
+      "learning_rate": 1e-05,
+      "loss": 0.0877,
+      "num_tokens": 382070478.0,
+      "reward": 0.6875,
+      "reward_std": 0.26538965106010437,
+      "rewards/accuracy_reward/mean": 0.6875,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999414086341858,
+      "sampling/importance_sampling_ratio/min": 0.0011518355458974838,
+      "sampling/sampling_logp_difference/max": 6.7663984298706055,
+      "sampling/sampling_logp_difference/mean": 0.016579966992139816,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 3.1177480195765384e-05,
+      "clip_ratio/high_mean": 1.1174359769938746e-05,
+      "clip_ratio/low_mean": 3.602651599976525e-05,
+      "clip_ratio/low_min": 4.348733455117326e-06,
+      "clip_ratio/region_mean": 4.720087713394605e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15978.0,
+      "completions/mean_length": 7021.1796875,
+      "completions/mean_terminated_length": 6872.56396484375,
+      "completions/min_length": 1371.0,
+      "completions/min_terminated_length": 1371.0,
+      "entropy": 0.8693460151553154,
+      "epoch": 0.40110395584176634,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00329192029312253,
+      "learning_rate": 1e-05,
+      "loss": 0.0342,
+      "num_tokens": 382990245.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999822378158569,
+      "sampling/importance_sampling_ratio/min": 0.0023386883549392223,
+      "sampling/sampling_logp_difference/max": 6.058165073394775,
+      "sampling/sampling_logp_difference/mean": 0.019863136112689972,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 1.1192694955752813e-05,
+      "clip_ratio/high_mean": 2.7981737389382033e-06,
+      "clip_ratio/low_mean": 4.9078003257818636e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.1876177280973934e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15344.0,
+      "completions/mean_length": 6917.625,
+      "completions/mean_terminated_length": 6452.0654296875,
+      "completions/min_length": 945.0,
+      "completions/min_terminated_length": 945.0,
+      "entropy": 0.8466897681355476,
+      "epoch": 0.40202391904323825,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0051889242604374886,
+      "learning_rate": 1e-05,
+      "loss": 0.1009,
+      "num_tokens": 383896717.0,
+      "reward": 0.4140625,
+      "reward_std": 0.3448137044906616,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999983310699463,
+      "sampling/importance_sampling_ratio/min": 0.00015846389578655362,
+      "sampling/sampling_logp_difference/max": 8.749983787536621,
+      "sampling/sampling_logp_difference/mean": 0.019528398290276527,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 2.3224948108691024e-05,
+      "clip_ratio/high_mean": 8.263948757303297e-06,
+      "clip_ratio/low_mean": 3.8556312347282073e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.682026019509067e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16175.0,
+      "completions/mean_length": 7487.5078125,
+      "completions/mean_terminated_length": 7346.2939453125,
+      "completions/min_length": 877.0,
+      "completions/min_terminated_length": 877.0,
+      "entropy": 0.9584660083055496,
+      "epoch": 0.4029438822447102,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002855573548004031,
+      "learning_rate": 1e-05,
+      "loss": 0.0087,
+      "num_tokens": 384872622.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2477683424949646,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999386668205261,
+      "sampling/importance_sampling_ratio/min": 0.0038593418430536985,
+      "sampling/sampling_logp_difference/max": 5.557258605957031,
+      "sampling/sampling_logp_difference/mean": 0.0209865253418684,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 6.171620498207631e-06,
+      "clip_ratio/high_mean": 1.5429051245519076e-06,
+      "clip_ratio/low_mean": 2.98128834401723e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.135578845103737e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16092.0,
+      "completions/mean_length": 6637.5078125,
+      "completions/mean_terminated_length": 6323.1044921875,
+      "completions/min_length": 998.0,
+      "completions/min_terminated_length": 998.0,
+      "entropy": 0.8841215297579765,
+      "epoch": 0.40386384544618215,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004437311552464962,
+      "learning_rate": 1e-05,
+      "loss": 0.0523,
+      "num_tokens": 385744023.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2603819966316223,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999136924743652,
+      "sampling/importance_sampling_ratio/min": 0.002925124252215028,
+      "sampling/sampling_logp_difference/max": 5.834418296813965,
+      "sampling/sampling_logp_difference/mean": 0.019490888342261314,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 1.3304874300956726e-05,
+      "clip_ratio/high_mean": 3.3262185752391815e-06,
+      "clip_ratio/low_mean": 5.443932013804442e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.776553894065728e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15143.0,
+      "completions/mean_length": 5965.9765625,
+      "completions/mean_terminated_length": 5800.611328125,
+      "completions/min_length": 621.0,
+      "completions/min_terminated_length": 621.0,
+      "entropy": 0.8726934269070625,
+      "epoch": 0.4047838086476541,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002463799435645342,
+      "learning_rate": 1e-05,
+      "loss": -0.0075,
+      "num_tokens": 386525492.0,
+      "reward": 0.3984375,
+      "reward_std": 0.30457615852355957,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999351501464844,
+      "sampling/importance_sampling_ratio/min": 0.00020367901015561074,
+      "sampling/sampling_logp_difference/max": 8.4989652633667,
+      "sampling/sampling_logp_difference/mean": 0.01946769654750824,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 1.0084711902891286e-05,
+      "clip_ratio/high_mean": 3.6154040117253317e-06,
+      "clip_ratio/low_mean": 3.598771945689805e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9603123695997056e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6693.109375,
+      "completions/mean_terminated_length": 6616.80322265625,
+      "completions/min_length": 1704.0,
+      "completions/min_terminated_length": 1704.0,
+      "entropy": 0.9430640190839767,
+      "epoch": 0.40570377184912604,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038990566972643137,
+      "learning_rate": 1e-05,
+      "loss": 0.0415,
+      "num_tokens": 387404842.0,
+      "reward": 0.421875,
+      "reward_std": 0.31587693095207214,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999700784683228,
+      "sampling/importance_sampling_ratio/min": 0.0011708902893587947,
+      "sampling/sampling_logp_difference/max": 6.749990940093994,
+      "sampling/sampling_logp_difference/mean": 0.020848294720053673,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 7.462686426151777e-06,
+      "clip_ratio/high_mean": 1.8656716065379442e-06,
+      "clip_ratio/low_mean": 5.234285907818048e-05,
+      "clip_ratio/low_min": 4.47803950009984e-06,
+      "clip_ratio/region_mean": 5.420853057103159e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16162.0,
+      "completions/mean_length": 7045.6953125,
+      "completions/mean_terminated_length": 6505.46240234375,
+      "completions/min_length": 926.0,
+      "completions/min_terminated_length": 926.0,
+      "entropy": 0.8912066072225571,
+      "epoch": 0.40662373505059796,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0018510994268581271,
+      "learning_rate": 1e-05,
+      "loss": 0.099,
+      "num_tokens": 388324475.0,
+      "reward": 0.40625,
+      "reward_std": 0.32195523381233215,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999024868011475,
+      "sampling/importance_sampling_ratio/min": 0.0031757301185280085,
+      "sampling/sampling_logp_difference/max": 5.752217769622803,
+      "sampling/sampling_logp_difference/mean": 0.020547039806842804,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 2.504527083146968e-05,
+      "clip_ratio/high_mean": 6.26131770786742e-06,
+      "clip_ratio/low_mean": 6.165269871871715e-05,
+      "clip_ratio/low_min": 3.5272871627967106e-06,
+      "clip_ratio/region_mean": 6.791401551708987e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15734.0,
+      "completions/mean_length": 7480.0078125,
+      "completions/mean_terminated_length": 7266.3125,
+      "completions/min_length": 1130.0,
+      "completions/min_terminated_length": 1130.0,
+      "entropy": 0.8813760280609131,
+      "epoch": 0.40754369825206993,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004439481534063816,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 389305644.0,
+      "reward": 0.34375,
+      "reward_std": 0.31300368905067444,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999762773513794,
+      "sampling/importance_sampling_ratio/min": 0.007449973840266466,
+      "sampling/sampling_logp_difference/max": 4.899544715881348,
+      "sampling/sampling_logp_difference/mean": 0.01973455585539341,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 4.0980917219712865e-06,
+      "clip_ratio/high_mean": 1.0245229304928216e-06,
+      "clip_ratio/low_mean": 3.662567087303614e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.76501939172158e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15302.0,
+      "completions/max_terminated_length": 15302.0,
+      "completions/mean_length": 7044.4453125,
+      "completions/mean_terminated_length": 7044.4453125,
+      "completions/min_length": 1229.0,
+      "completions/min_terminated_length": 1229.0,
+      "entropy": 0.9901906549930573,
+      "epoch": 0.40846366145354185,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.004181519150733948,
+      "learning_rate": 1e-05,
+      "loss": -0.0068,
+      "num_tokens": 390229373.0,
+      "reward": 0.421875,
+      "reward_std": 0.17700131237506866,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000314712524414,
+      "sampling/importance_sampling_ratio/min": 0.00022536676260642707,
+      "sampling/sampling_logp_difference/max": 8.397781372070312,
+      "sampling/sampling_logp_difference/mean": 0.021211043000221252,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 1.4909872106727562e-05,
+      "clip_ratio/high_mean": 3.7274680266818905e-06,
+      "clip_ratio/low_mean": 5.29995777469594e-05,
+      "clip_ratio/low_min": 3.708758640641463e-06,
+      "clip_ratio/region_mean": 5.672704537573736e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7815.8125,
+      "completions/mean_terminated_length": 7244.6005859375,
+      "completions/min_length": 1350.0,
+      "completions/min_terminated_length": 1350.0,
+      "entropy": 0.8278292864561081,
+      "epoch": 0.4093836246550138,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002691390924155712,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 391251141.0,
+      "reward": 0.3515625,
+      "reward_std": 0.31222954392433167,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.99993896484375,
+      "sampling/importance_sampling_ratio/min": 0.007715471088886261,
+      "sampling/sampling_logp_difference/max": 4.864527702331543,
+      "sampling/sampling_logp_difference/mean": 0.018415704369544983,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 2.1858722902834415e-05,
+      "clip_ratio/high_mean": 6.629899417021079e-06,
+      "clip_ratio/low_mean": 3.196247394043894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.859237290271267e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15202.0,
+      "completions/mean_length": 5305.1796875,
+      "completions/mean_terminated_length": 5217.94482421875,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.8100772425532341,
+      "epoch": 0.41030358785648574,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0069543467834591866,
+      "learning_rate": 1e-05,
+      "loss": 0.1153,
+      "num_tokens": 391956196.0,
+      "reward": 0.609375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000190734863281,
+      "sampling/importance_sampling_ratio/min": 0.0024869756307452917,
+      "sampling/sampling_logp_difference/max": 5.996687889099121,
+      "sampling/sampling_logp_difference/mean": 0.017318082973361015,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 2.461934036546154e-05,
+      "clip_ratio/high_mean": 8.056288947955181e-06,
+      "clip_ratio/low_mean": 5.289376917971822e-05,
+      "clip_ratio/low_min": 4.21926688431995e-06,
+      "clip_ratio/region_mean": 6.0950058468733914e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15300.0,
+      "completions/mean_length": 7299.578125,
+      "completions/mean_terminated_length": 6930.29248046875,
+      "completions/min_length": 1008.0,
+      "completions/min_terminated_length": 1008.0,
+      "entropy": 0.9955824315547943,
+      "epoch": 0.41122355105795766,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0065611582249403,
+      "learning_rate": 1e-05,
+      "loss": 0.0883,
+      "num_tokens": 392908430.0,
+      "reward": 0.4375,
+      "reward_std": 0.304571270942688,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999696016311646,
+      "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06,
+      "sampling/sampling_logp_difference/max": 11.873339653015137,
+      "sampling/sampling_logp_difference/mean": 0.02127375639975071,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 2.4339562514796853e-05,
+      "clip_ratio/high_mean": 7.412756531266496e-06,
+      "clip_ratio/low_mean": 3.89272447591793e-05,
+      "clip_ratio/low_min": 4.047796210215893e-06,
+      "clip_ratio/region_mean": 4.6340001517819474e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 6702.9375,
+      "completions/mean_terminated_length": 6390.64501953125,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.82919991761446,
+      "epoch": 0.41214351425942963,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032975098583847284,
+      "learning_rate": 1e-05,
+      "loss": 0.0725,
+      "num_tokens": 393788286.0,
+      "reward": 0.4609375,
+      "reward_std": 0.27168765664100647,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999115467071533,
+      "sampling/importance_sampling_ratio/min": 0.00028582560480572283,
+      "sampling/sampling_logp_difference/max": 8.160128593444824,
+      "sampling/sampling_logp_difference/mean": 0.019461583346128464,
+      "step": 448
+    },
+    {
+      "clip_ratio/high_max": 2.3807599063729867e-05,
+      "clip_ratio/high_mean": 5.951899765932467e-06,
+      "clip_ratio/low_mean": 3.195798365140945e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.790988330365508e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15244.0,
+      "completions/mean_length": 6468.9453125,
+      "completions/mean_terminated_length": 5536.7607421875,
+      "completions/min_length": 808.0,
+      "completions/min_terminated_length": 808.0,
+      "entropy": 0.6471721827983856,
+      "epoch": 0.41306347746090155,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032787907402962446,
+      "learning_rate": 1e-05,
+      "loss": 0.1149,
+      "num_tokens": 394638159.0,
+      "reward": 0.625,
+      "reward_std": 0.25354722142219543,
+      "rewards/accuracy_reward/mean": 0.625,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999669790267944,
+      "sampling/importance_sampling_ratio/min": 0.00012341380352154374,
+      "sampling/sampling_logp_difference/max": 8.999967575073242,
+      "sampling/sampling_logp_difference/mean": 0.016151495277881622,
+      "step": 449
+    },
+    {
+      "clip_ratio/high_max": 2.247072688987828e-05,
+      "clip_ratio/high_mean": 5.61768172246957e-06,
+      "clip_ratio/low_mean": 6.035319393049576e-05,
+      "clip_ratio/low_min": 4.063190772285452e-06,
+      "clip_ratio/region_mean": 6.597087667614687e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15931.0,
+      "completions/mean_length": 6547.3203125,
+      "completions/mean_terminated_length": 6230.0078125,
+      "completions/min_length": 587.0,
+      "completions/min_terminated_length": 587.0,
+      "entropy": 0.9123960956931114,
+      "epoch": 0.4139834406623735,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038375966250896454,
+      "learning_rate": 1e-05,
+      "loss": 0.0967,
+      "num_tokens": 395493872.0,
+      "reward": 0.4296875,
+      "reward_std": 0.30798619985580444,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999747276306152,
+      "sampling/importance_sampling_ratio/min": 0.00016009423416107893,
+      "sampling/sampling_logp_difference/max": 8.739748001098633,
+      "sampling/sampling_logp_difference/mean": 0.019957344979047775,
+      "step": 450
+    },
+    {
+      "clip_ratio/high_max": 1.404482372890925e-05,
+      "clip_ratio/high_mean": 3.5112059322273126e-06,
+      "clip_ratio/low_mean": 2.315102483407827e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6662230766305584e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15058.0,
+      "completions/mean_length": 6291.859375,
+      "completions/mean_terminated_length": 6131.6669921875,
+      "completions/min_length": 823.0,
+      "completions/min_terminated_length": 823.0,
+      "entropy": 0.9841655194759369,
+      "epoch": 0.41490340386384544,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003903903067111969,
+      "learning_rate": 1e-05,
+      "loss": 0.0656,
+      "num_tokens": 396320254.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2569621503353119,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999452829360962,
+      "sampling/importance_sampling_ratio/min": 6.564632712979801e-06,
+      "sampling/sampling_logp_difference/max": 11.93381404876709,
+      "sampling/sampling_logp_difference/mean": 0.020753150805830956,
+      "step": 451
+    },
+    {
+      "clip_ratio/high_max": 1.5189204987109406e-05,
+      "clip_ratio/high_mean": 4.615214265868417e-06,
+      "clip_ratio/low_mean": 3.547988831087423e-05,
+      "clip_ratio/low_min": 3.3967392027989263e-06,
+      "clip_ratio/region_mean": 4.009510257674265e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15966.0,
+      "completions/mean_length": 7692.4296875,
+      "completions/mean_terminated_length": 7339.11376953125,
+      "completions/min_length": 1269.0,
+      "completions/min_terminated_length": 1269.0,
+      "entropy": 0.94080401211977,
+      "epoch": 0.41582336706531736,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005152889993041754,
+      "learning_rate": 1e-05,
+      "loss": 0.0511,
+      "num_tokens": 397327029.0,
+      "reward": 0.390625,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999433755874634,
+      "sampling/importance_sampling_ratio/min": 5.027571751270443e-05,
+      "sampling/sampling_logp_difference/max": 9.897988319396973,
+      "sampling/sampling_logp_difference/mean": 0.02036213129758835,
+      "step": 452
+    },
+    {
+      "clip_ratio/high_max": 1.733157705530175e-05,
+      "clip_ratio/high_mean": 6.0586507970583625e-06,
+      "clip_ratio/low_mean": 2.335082047011383e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9409470812424843e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15305.0,
+      "completions/mean_length": 6968.0859375,
+      "completions/mean_terminated_length": 6742.1044921875,
+      "completions/min_length": 893.0,
+      "completions/min_terminated_length": 893.0,
+      "entropy": 0.9254838973283768,
+      "epoch": 0.41674333026678934,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035838852636516094,
+      "learning_rate": 1e-05,
+      "loss": 0.0182,
+      "num_tokens": 398237536.0,
+      "reward": 0.484375,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000159740447998,
+      "sampling/importance_sampling_ratio/min": 0.002404628787189722,
+      "sampling/sampling_logp_difference/max": 6.030359745025635,
+      "sampling/sampling_logp_difference/mean": 0.020200733095407486,
+      "step": 453
+    },
+    {
+      "clip_ratio/high_max": 4.464923677005572e-06,
+      "clip_ratio/high_mean": 1.116230919251393e-06,
+      "clip_ratio/low_mean": 3.311113533754906e-05,
+      "clip_ratio/low_min": 6.725854291289579e-06,
+      "clip_ratio/region_mean": 3.422736637048729e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16309.0,
+      "completions/mean_length": 8711.078125,
+      "completions/mean_terminated_length": 8199.55078125,
+      "completions/min_length": 1049.0,
+      "completions/min_terminated_length": 1049.0,
+      "entropy": 0.8735406622290611,
+      "epoch": 0.41766329346826125,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0036290446296334267,
+      "learning_rate": 1e-05,
+      "loss": 0.0412,
+      "num_tokens": 399373298.0,
+      "reward": 0.359375,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000042200088501,
+      "sampling/importance_sampling_ratio/min": 9.216561011271551e-05,
+      "sampling/sampling_logp_difference/max": 9.291923522949219,
+      "sampling/sampling_logp_difference/mean": 0.0201371181756258,
+      "step": 454
+    },
+    {
+      "clip_ratio/high_max": 3.4702664606811595e-05,
+      "clip_ratio/high_mean": 8.675666151702899e-06,
+      "clip_ratio/low_mean": 3.3217100849469716e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.189276808119757e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14737.0,
+      "completions/mean_length": 6891.078125,
+      "completions/mean_terminated_length": 6663.24853515625,
+      "completions/min_length": 827.0,
+      "completions/min_terminated_length": 827.0,
+      "entropy": 0.8689641878008842,
+      "epoch": 0.41858325666973323,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004067540634423494,
+      "learning_rate": 1e-05,
+      "loss": 0.0633,
+      "num_tokens": 400273708.0,
+      "reward": 0.484375,
+      "reward_std": 0.27274850010871887,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999425411224365,
+      "sampling/importance_sampling_ratio/min": 4.0002717582865444e-07,
+      "sampling/sampling_logp_difference/max": 14.731733322143555,
+      "sampling/sampling_logp_difference/mean": 0.019800148904323578,
+      "step": 455
+    },
+    {
+      "clip_ratio/high_max": 2.939170826721238e-06,
+      "clip_ratio/high_mean": 7.347927066803095e-07,
+      "clip_ratio/low_mean": 3.564125790944672e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6376050502440194e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15234.0,
+      "completions/mean_length": 6899.3515625,
+      "completions/mean_terminated_length": 6748.8017578125,
+      "completions/min_length": 1149.0,
+      "completions/min_terminated_length": 1149.0,
+      "entropy": 0.9442604705691338,
+      "epoch": 0.41950321987120515,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0026191689539700747,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "num_tokens": 401177497.0,
+      "reward": 0.46875,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 0.0017910725437104702,
+      "sampling/sampling_logp_difference/max": 6.3249406814575195,
+      "sampling/sampling_logp_difference/mean": 0.021380646154284477,
+      "step": 456
+    },
+    {
+      "clip_ratio/high_max": 8.99604128790088e-06,
+      "clip_ratio/high_mean": 2.24901032197522e-06,
+      "clip_ratio/low_mean": 2.57235833487357e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.797259367071092e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16226.0,
+      "completions/mean_length": 7175.8359375,
+      "completions/mean_terminated_length": 7029.6748046875,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.8653769046068192,
+      "epoch": 0.4204231830726771,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003141516586765647,
+      "learning_rate": 1e-05,
+      "loss": 0.0674,
+      "num_tokens": 402115812.0,
+      "reward": 0.4375,
+      "reward_std": 0.21040895581245422,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999862909317017,
+      "sampling/importance_sampling_ratio/min": 0.001265019178390503,
+      "sampling/sampling_logp_difference/max": 6.672667980194092,
+      "sampling/sampling_logp_difference/mean": 0.01970163732767105,
+      "step": 457
+    },
+    {
+      "clip_ratio/high_max": 1.0800059499160852e-05,
+      "clip_ratio/high_mean": 2.700014874790213e-06,
+      "clip_ratio/low_mean": 3.116219727417047e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3862211807900167e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16250.0,
+      "completions/mean_length": 7090.8515625,
+      "completions/mean_terminated_length": 6791.072265625,
+      "completions/min_length": 606.0,
+      "completions/min_terminated_length": 606.0,
+      "entropy": 0.9437825232744217,
+      "epoch": 0.42134314627414904,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.001980370609089732,
+      "learning_rate": 1e-05,
+      "loss": 0.0751,
+      "num_tokens": 403048385.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999619722366333,
+      "sampling/importance_sampling_ratio/min": 1.4011449138706666e-06,
+      "sampling/sampling_logp_difference/max": 13.47822093963623,
+      "sampling/sampling_logp_difference/mean": 0.021090596914291382,
+      "step": 458
+    },
+    {
+      "clip_ratio/high_max": 2.5482850560365478e-05,
+      "clip_ratio/high_mean": 6.370712640091369e-06,
+      "clip_ratio/low_mean": 4.8558076969129615e-05,
+      "clip_ratio/low_min": 4.8952420002024155e-06,
+      "clip_ratio/region_mean": 5.4928788131292094e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16175.0,
+      "completions/mean_length": 7033.65625,
+      "completions/mean_terminated_length": 6809.24853515625,
+      "completions/min_length": 1007.0,
+      "completions/min_terminated_length": 1007.0,
+      "entropy": 0.8789731040596962,
+      "epoch": 0.42226310947562096,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003833206370472908,
+      "learning_rate": 1e-05,
+      "loss": 0.059,
+      "num_tokens": 403968037.0,
+      "reward": 0.46875,
+      "reward_std": 0.28460076451301575,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000317096710205,
+      "sampling/importance_sampling_ratio/min": 0.0021942879538983107,
+      "sampling/sampling_logp_difference/max": 6.1218976974487305,
+      "sampling/sampling_logp_difference/mean": 0.019913772121071815,
+      "step": 459
+    },
+    {
+      "clip_ratio/high_max": 4.068877842655638e-06,
+      "clip_ratio/high_mean": 1.0172194606639096e-06,
+      "clip_ratio/low_mean": 6.774969961043098e-05,
+      "clip_ratio/low_min": 3.189914878021227e-06,
+      "clip_ratio/region_mean": 6.876691895740805e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16107.0,
+      "completions/mean_length": 6992.8984375,
+      "completions/mean_terminated_length": 6611.14599609375,
+      "completions/min_length": 754.0,
+      "completions/min_terminated_length": 754.0,
+      "entropy": 0.857115626335144,
+      "epoch": 0.42318307267709293,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005315023008733988,
+      "learning_rate": 1e-05,
+      "loss": 0.1581,
+      "num_tokens": 404881584.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3469353914260864,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000758171081543,
+      "sampling/importance_sampling_ratio/min": 4.546630952972919e-05,
+      "sampling/sampling_logp_difference/max": 9.998538970947266,
+      "sampling/sampling_logp_difference/mean": 0.01872519962489605,
+      "step": 460
+    },
+    {
+      "clip_ratio/high_max": 1.167047457784065e-05,
+      "clip_ratio/high_mean": 2.9176186444601626e-06,
+      "clip_ratio/low_mean": 3.3195502112448594e-05,
+      "clip_ratio/low_min": 5.25188033861923e-06,
+      "clip_ratio/region_mean": 3.611312064322192e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16095.0,
+      "completions/mean_length": 6623.2578125,
+      "completions/mean_terminated_length": 6226.4794921875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "entropy": 0.8803941905498505,
+      "epoch": 0.42410303587856485,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0074885934591293335,
+      "learning_rate": 1e-05,
+      "loss": 0.1076,
+      "num_tokens": 405749105.0,
+      "reward": 0.515625,
+      "reward_std": 0.25354722142219543,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999799728393555,
+      "sampling/importance_sampling_ratio/min": 0.0011723897187039256,
+      "sampling/sampling_logp_difference/max": 6.748711109161377,
+      "sampling/sampling_logp_difference/mean": 0.01930626854300499,
+      "step": 461
+    },
+    {
+      "clip_ratio/high_max": 4.11753080697963e-06,
+      "clip_ratio/high_mean": 1.0293827017449075e-06,
+      "clip_ratio/low_mean": 5.09268712676203e-05,
+      "clip_ratio/low_min": 1.1170248626513057e-05,
+      "clip_ratio/region_mean": 5.195625465148623e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15032.0,
+      "completions/mean_length": 7244.8203125,
+      "completions/mean_terminated_length": 6647.5419921875,
+      "completions/min_length": 1227.0,
+      "completions/min_terminated_length": 1227.0,
+      "entropy": 0.9202689751982689,
+      "epoch": 0.4250229990800368,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003960717935115099,
+      "learning_rate": 1e-05,
+      "loss": 0.0536,
+      "num_tokens": 406704618.0,
+      "reward": 0.484375,
+      "reward_std": 0.2880108058452606,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999812841415405,
+      "sampling/importance_sampling_ratio/min": 1.69715603988152e-05,
+      "sampling/sampling_logp_difference/max": 10.98397159576416,
+      "sampling/sampling_logp_difference/mean": 0.02019711770117283,
+      "step": 462
+    },
+    {
+      "clip_ratio/high_max": 2.874629831239872e-05,
+      "clip_ratio/high_mean": 1.0519701334033016e-05,
+      "clip_ratio/low_mean": 5.367962035052187e-05,
+      "clip_ratio/low_min": 6.5083827394119e-06,
+      "clip_ratio/region_mean": 6.419932219614566e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16296.0,
+      "completions/mean_length": 7462.0546875,
+      "completions/mean_terminated_length": 6867.2587890625,
+      "completions/min_length": 669.0,
+      "completions/min_terminated_length": 669.0,
+      "entropy": 0.8141553401947021,
+      "epoch": 0.42594296228150874,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003602087963372469,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 407677177.0,
+      "reward": 0.421875,
+      "reward_std": 0.35482609272003174,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999440312385559,
+      "sampling/importance_sampling_ratio/min": 0.0007806668290868402,
+      "sampling/sampling_logp_difference/max": 7.155362129211426,
+      "sampling/sampling_logp_difference/mean": 0.01856713369488716,
+      "step": 463
+    },
+    {
+      "clip_ratio/high_max": 2.6413443720230134e-05,
+      "clip_ratio/high_mean": 8.973188073468918e-06,
+      "clip_ratio/low_mean": 3.5997712757307454e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.497090230870526e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15750.0,
+      "completions/mean_length": 6683.1796875,
+      "completions/mean_terminated_length": 6529.19873046875,
+      "completions/min_length": 775.0,
+      "completions/min_terminated_length": 775.0,
+      "entropy": 0.9070071652531624,
+      "epoch": 0.42686292548298066,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004038481041789055,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 408552512.0,
+      "reward": 0.4609375,
+      "reward_std": 0.25620076060295105,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000439882278442,
+      "sampling/importance_sampling_ratio/min": 4.474630986806005e-05,
+      "sampling/sampling_logp_difference/max": 10.014501571655273,
+      "sampling/sampling_logp_difference/mean": 0.02077356167137623,
+      "step": 464
+    },
+    {
+      "clip_ratio/high_max": 1.7171289982798044e-05,
+      "clip_ratio/high_mean": 4.292822495699511e-06,
+      "clip_ratio/low_mean": 3.225401701456576e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.654683996501262e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15864.0,
+      "completions/mean_length": 6472.9453125,
+      "completions/mean_terminated_length": 5985.51611328125,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.8807859197258949,
+      "epoch": 0.42778288868445263,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004457853268831968,
+      "learning_rate": 1e-05,
+      "loss": 0.0295,
+      "num_tokens": 409399257.0,
+      "reward": 0.421875,
+      "reward_std": 0.20517179369926453,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999473690986633,
+      "sampling/importance_sampling_ratio/min": 0.0017577135004103184,
+      "sampling/sampling_logp_difference/max": 6.343741416931152,
+      "sampling/sampling_logp_difference/mean": 0.020475786179304123,
+      "step": 465
+    },
+    {
+      "clip_ratio/high_max": 5.442162637336878e-05,
+      "clip_ratio/high_mean": 1.584139977239829e-05,
+      "clip_ratio/low_mean": 5.706528349946893e-05,
+      "clip_ratio/low_min": 2.5156462925224332e-05,
+      "clip_ratio/region_mean": 7.290668463610928e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15896.0,
+      "completions/mean_length": 5989.78125,
+      "completions/mean_terminated_length": 5654.48388671875,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.8479711338877678,
+      "epoch": 0.42870285188592455,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0033953245729207993,
+      "learning_rate": 1e-05,
+      "loss": 0.1217,
+      "num_tokens": 410185645.0,
+      "reward": 0.5,
+      "reward_std": 0.3735082745552063,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999676942825317,
+      "sampling/importance_sampling_ratio/min": 1.781588616722729e-05,
+      "sampling/sampling_logp_difference/max": 10.935420036315918,
+      "sampling/sampling_logp_difference/mean": 0.017986344173550606,
+      "step": 466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.2673244681500364e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.2673244681500364e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16103.0,
+      "completions/mean_length": 8299.9453125,
+      "completions/mean_terminated_length": 8171.62744140625,
+      "completions/min_length": 1123.0,
+      "completions/min_terminated_length": 1123.0,
+      "entropy": 0.9363152608275414,
+      "epoch": 0.4296228150873965,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002381247701123357,
+      "learning_rate": 1e-05,
+      "loss": 0.0651,
+      "num_tokens": 411268974.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2477683573961258,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999679327011108,
+      "sampling/importance_sampling_ratio/min": 0.000553094083443284,
+      "sampling/sampling_logp_difference/max": 7.4999823570251465,
+      "sampling/sampling_logp_difference/mean": 0.021354343742132187,
+      "step": 467
+    },
+    {
+      "clip_ratio/high_max": 8.578695997130126e-06,
+      "clip_ratio/high_mean": 2.1446739992825314e-06,
+      "clip_ratio/low_mean": 2.84454882830687e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.059016239603807e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14838.0,
+      "completions/mean_length": 7434.0546875,
+      "completions/mean_terminated_length": 7219.25634765625,
+      "completions/min_length": 898.0,
+      "completions/min_terminated_length": 898.0,
+      "entropy": 0.981913685798645,
+      "epoch": 0.43054277828886844,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.006341467145830393,
+      "learning_rate": 1e-05,
+      "loss": -0.003,
+      "num_tokens": 412238117.0,
+      "reward": 0.390625,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000128746032715,
+      "sampling/importance_sampling_ratio/min": 0.0019304680172353983,
+      "sampling/sampling_logp_difference/max": 6.249992847442627,
+      "sampling/sampling_logp_difference/mean": 0.02139873616397381,
+      "step": 468
+    },
+    {
+      "clip_ratio/high_max": 1.7187987396027893e-05,
+      "clip_ratio/high_mean": 5.150076049176278e-06,
+      "clip_ratio/low_mean": 5.4699471832009294e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.9849548279089504e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15871.0,
+      "completions/mean_length": 7211.1796875,
+      "completions/mean_terminated_length": 7138.95263671875,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "entropy": 0.9307222217321396,
+      "epoch": 0.43146274149034036,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002621602965518832,
+      "learning_rate": 1e-05,
+      "loss": 0.0562,
+      "num_tokens": 413182860.0,
+      "reward": 0.3203125,
+      "reward_std": 0.34716784954071045,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999529123306274,
+      "sampling/importance_sampling_ratio/min": 5.1446182624204084e-05,
+      "sampling/sampling_logp_difference/max": 9.874974250793457,
+      "sampling/sampling_logp_difference/mean": 0.020250719040632248,
+      "step": 469
+    },
+    {
+      "clip_ratio/high_max": 1.0867412584047997e-05,
+      "clip_ratio/high_mean": 3.9217885614561965e-06,
+      "clip_ratio/low_mean": 4.7740833792886406e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.16626223543426e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15726.0,
+      "completions/mean_length": 5349.4296875,
+      "completions/mean_terminated_length": 5174.2783203125,
+      "completions/min_length": 983.0,
+      "completions/min_terminated_length": 983.0,
+      "entropy": 1.0213474333286285,
+      "epoch": 0.43238270469181234,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035241330042481422,
+      "learning_rate": 1e-05,
+      "loss": 0.0657,
+      "num_tokens": 413885963.0,
+      "reward": 0.3046875,
+      "reward_std": 0.25330984592437744,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999449253082275,
+      "sampling/importance_sampling_ratio/min": 0.0003569081309251487,
+      "sampling/sampling_logp_difference/max": 7.938032150268555,
+      "sampling/sampling_logp_difference/mean": 0.01975759118795395,
+      "step": 470
+    },
+    {
+      "clip_ratio/high_max": 1.469514609198086e-05,
+      "clip_ratio/high_mean": 3.673786522995215e-06,
+      "clip_ratio/low_mean": 2.699725871480041e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0671045237795624e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15357.0,
+      "completions/mean_length": 7542.8515625,
+      "completions/mean_terminated_length": 7257.65283203125,
+      "completions/min_length": 1359.0,
+      "completions/min_terminated_length": 1359.0,
+      "entropy": 0.8882969543337822,
+      "epoch": 0.43330266789328425,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0014164346503093839,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 414870560.0,
+      "reward": 0.3671875,
+      "reward_std": 0.20753081142902374,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000402927398682,
+      "sampling/importance_sampling_ratio/min": 6.435441900976002e-05,
+      "sampling/sampling_logp_difference/max": 9.651104927062988,
+      "sampling/sampling_logp_difference/mean": 0.020874422043561935,
+      "step": 471
+    },
+    {
+      "clip_ratio/high_max": 1.669827497607912e-05,
+      "clip_ratio/high_mean": 4.17456874401978e-06,
+      "clip_ratio/low_mean": 3.673103901746799e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.090560787517461e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7286.90625,
+      "completions/mean_terminated_length": 6993.451171875,
+      "completions/min_length": 977.0,
+      "completions/min_terminated_length": 977.0,
+      "entropy": 0.9254636988043785,
+      "epoch": 0.43422263109475623,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0026956009678542614,
+      "learning_rate": 1e-05,
+      "loss": 0.0567,
+      "num_tokens": 415825252.0,
+      "reward": 0.328125,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999917209148407,
+      "sampling/importance_sampling_ratio/min": 0.0019701423589140177,
+      "sampling/sampling_logp_difference/max": 6.229649543762207,
+      "sampling/sampling_logp_difference/mean": 0.0202642735093832,
+      "step": 472
+    },
+    {
+      "clip_ratio/high_max": 9.162045444099931e-06,
+      "clip_ratio/high_mean": 2.2905113610249828e-06,
+      "clip_ratio/low_mean": 3.818475033767754e-05,
+      "clip_ratio/low_min": 7.20606476534158e-06,
+      "clip_ratio/region_mean": 4.047526181238936e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15908.0,
+      "completions/mean_length": 7244.7421875,
+      "completions/mean_terminated_length": 6716.0244140625,
+      "completions/min_length": 1010.0,
+      "completions/min_terminated_length": 1010.0,
+      "entropy": 0.7817923128604889,
+      "epoch": 0.43514259429622815,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0022128887940198183,
+      "learning_rate": 1e-05,
+      "loss": 0.0577,
+      "num_tokens": 416774011.0,
+      "reward": 0.453125,
+      "reward_std": 0.2937847375869751,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002384185791,
+      "sampling/importance_sampling_ratio/min": 0.0015034435782581568,
+      "sampling/sampling_logp_difference/max": 6.499997138977051,
+      "sampling/sampling_logp_difference/mean": 0.01840684749186039,
+      "step": 473
+    },
+    {
+      "clip_ratio/high_max": 1.2232871313244686e-05,
+      "clip_ratio/high_mean": 3.0582178283111716e-06,
+      "clip_ratio/low_mean": 3.636896872194484e-05,
+      "clip_ratio/low_min": 3.1460788250115e-06,
+      "clip_ratio/region_mean": 3.9427186266038916e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16254.0,
+      "completions/mean_length": 9042.90625,
+      "completions/mean_terminated_length": 8283.482421875,
+      "completions/min_length": 997.0,
+      "completions/min_terminated_length": 997.0,
+      "entropy": 0.9306210279464722,
+      "epoch": 0.43606255749770007,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0034676652867347,
+      "learning_rate": 1e-05,
+      "loss": 0.0504,
+      "num_tokens": 417951311.0,
+      "reward": 0.265625,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999234080314636,
+      "sampling/importance_sampling_ratio/min": 0.0002641192404553294,
+      "sampling/sampling_logp_difference/max": 8.239109992980957,
+      "sampling/sampling_logp_difference/mean": 0.02112819254398346,
+      "step": 474
+    },
+    {
+      "clip_ratio/high_max": 2.5187824576278217e-05,
+      "clip_ratio/high_mean": 8.202394610634656e-06,
+      "clip_ratio/low_mean": 4.3606626604741905e-05,
+      "clip_ratio/low_min": 3.5752079838857753e-06,
+      "clip_ratio/region_mean": 5.1809020988002885e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15721.0,
+      "completions/mean_length": 6763.6328125,
+      "completions/mean_terminated_length": 6610.9287109375,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.9879302233457565,
+      "epoch": 0.43698252069917204,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0030218157917261124,
+      "learning_rate": 1e-05,
+      "loss": 0.0704,
+      "num_tokens": 418836184.0,
+      "reward": 0.484375,
+      "reward_std": 0.30091896653175354,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999898672103882,
+      "sampling/importance_sampling_ratio/min": 0.0003778560785576701,
+      "sampling/sampling_logp_difference/max": 7.880997180938721,
+      "sampling/sampling_logp_difference/mean": 0.021101050078868866,
+      "step": 475
+    },
+    {
+      "clip_ratio/high_max": 1.0644185749697499e-05,
+      "clip_ratio/high_mean": 2.6610464374243747e-06,
+      "clip_ratio/low_mean": 6.21261324340594e-05,
+      "clip_ratio/low_min": 3.6509140954876784e-06,
+      "clip_ratio/region_mean": 6.478717887148377e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15675.0,
+      "completions/mean_length": 6794.25,
+      "completions/mean_terminated_length": 6564.09619140625,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 1.0259138569235802,
+      "epoch": 0.43790248390064396,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002881827764213085,
+      "learning_rate": 1e-05,
+      "loss": 0.0592,
+      "num_tokens": 419726192.0,
+      "reward": 0.265625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999275207519531,
+      "sampling/importance_sampling_ratio/min": 9.217044407705544e-07,
+      "sampling/sampling_logp_difference/max": 13.897041320800781,
+      "sampling/sampling_logp_difference/mean": 0.0210823193192482,
+      "step": 476
+    },
+    {
+      "clip_ratio/high_max": 1.108860487875063e-05,
+      "clip_ratio/high_mean": 2.7721512196876574e-06,
+      "clip_ratio/low_mean": 4.70996876629215e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.9871839337356505e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14281.0,
+      "completions/max_terminated_length": 14281.0,
+      "completions/mean_length": 5648.2109375,
+      "completions/mean_terminated_length": 5648.2109375,
+      "completions/min_length": 935.0,
+      "completions/min_terminated_length": 935.0,
+      "entropy": 0.88894472271204,
+      "epoch": 0.43882244710211593,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00289533962495625,
+      "learning_rate": 1e-05,
+      "loss": 0.0484,
+      "num_tokens": 420468867.0,
+      "reward": 0.484375,
+      "reward_std": 0.2675113081932068,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998449087142944,
+      "sampling/importance_sampling_ratio/min": 0.001372925122268498,
+      "sampling/sampling_logp_difference/max": 6.590811729431152,
+      "sampling/sampling_logp_difference/mean": 0.018499158322811127,
+      "step": 477
+    },
+    {
+      "clip_ratio/high_max": 4.753574557980755e-06,
+      "clip_ratio/high_mean": 1.1883936394951888e-06,
+      "clip_ratio/low_mean": 2.4103785335682915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.5292179316238617e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15657.0,
+      "completions/mean_length": 6188.359375,
+      "completions/mean_terminated_length": 6026.52392578125,
+      "completions/min_length": 1085.0,
+      "completions/min_terminated_length": 1085.0,
+      "entropy": 0.8476063013076782,
+      "epoch": 0.43974241030358785,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.002749695209786296,
+      "learning_rate": 1e-05,
+      "loss": 0.0012,
+      "num_tokens": 421280881.0,
+      "reward": 0.3671875,
+      "reward_std": 0.15991678833961487,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796152114868,
+      "sampling/importance_sampling_ratio/min": 0.004578418098390102,
+      "sampling/sampling_logp_difference/max": 5.386401653289795,
+      "sampling/sampling_logp_difference/mean": 0.018456483259797096,
+      "step": 478
+    },
+    {
+      "clip_ratio/high_max": 4.1359915030625416e-05,
+      "clip_ratio/high_mean": 1.0339978757656354e-05,
+      "clip_ratio/low_mean": 4.786080125995795e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.8200780586048495e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15112.0,
+      "completions/mean_length": 6864.3515625,
+      "completions/mean_terminated_length": 6635.88037109375,
+      "completions/min_length": 1065.0,
+      "completions/min_terminated_length": 1065.0,
+      "entropy": 0.8666203916072845,
+      "epoch": 0.4406623735050598,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.005116373300552368,
+      "learning_rate": 1e-05,
+      "loss": 0.0347,
+      "num_tokens": 422177822.0,
+      "reward": 0.4453125,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545216560364,
+      "sampling/importance_sampling_ratio/min": 0.00020385721290949732,
+      "sampling/sampling_logp_difference/max": 8.498090744018555,
+      "sampling/sampling_logp_difference/mean": 0.01979806460440159,
+      "step": 479
+    },
+    {
+      "clip_ratio/high_max": 1.4544774558089557e-05,
+      "clip_ratio/high_mean": 3.6361936395223893e-06,
+      "clip_ratio/low_mean": 4.153812756158004e-05,
+      "clip_ratio/low_min": 3.606462769312202e-06,
+      "clip_ratio/region_mean": 4.51743208031985e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15841.0,
+      "completions/mean_length": 7023.828125,
+      "completions/mean_terminated_length": 6799.18408203125,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.9098334684967995,
+      "epoch": 0.44158233670653174,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0020944855641573668,
+      "learning_rate": 1e-05,
+      "loss": 0.019,
+      "num_tokens": 423096576.0,
+      "reward": 0.2734375,
+      "reward_std": 0.20858672261238098,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999480247497559,
+      "sampling/importance_sampling_ratio/min": 0.0027383591514080763,
+      "sampling/sampling_logp_difference/max": 5.900396347045898,
+      "sampling/sampling_logp_difference/mean": 0.020111342892050743,
+      "step": 480
+    },
+    {
+      "clip_ratio/high_max": 3.256236095694476e-05,
+      "clip_ratio/high_mean": 1.2372795026749372e-05,
+      "clip_ratio/low_mean": 5.0774355258909054e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.314715119515313e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15527.0,
+      "completions/mean_length": 6666.828125,
+      "completions/mean_terminated_length": 6512.587890625,
+      "completions/min_length": 872.0,
+      "completions/min_terminated_length": 872.0,
+      "entropy": 0.9162466824054718,
+      "epoch": 0.44250229990800366,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003897767048329115,
+      "learning_rate": 1e-05,
+      "loss": 0.1151,
+      "num_tokens": 423968050.0,
+      "reward": 0.46875,
+      "reward_std": 0.3527044653892517,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999406337738037,
+      "sampling/importance_sampling_ratio/min": 0.0031828521750867367,
+      "sampling/sampling_logp_difference/max": 5.7499775886535645,
+      "sampling/sampling_logp_difference/mean": 0.019923247396945953,
+      "step": 481
+    },
+    {
+      "clip_ratio/high_max": 1.5341902098953142e-05,
+      "clip_ratio/high_mean": 4.791600815678976e-06,
+      "clip_ratio/low_mean": 7.980174223121139e-05,
+      "clip_ratio/low_min": 2.6713308216130827e-05,
+      "clip_ratio/region_mean": 8.459334412691533e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16223.0,
+      "completions/mean_length": 7159.8046875,
+      "completions/mean_terminated_length": 7013.38916015625,
+      "completions/min_length": 1022.0,
+      "completions/min_terminated_length": 1022.0,
+      "entropy": 0.8444746807217598,
+      "epoch": 0.44342226310947563,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003038195427507162,
+      "learning_rate": 1e-05,
+      "loss": 0.042,
+      "num_tokens": 424902953.0,
+      "reward": 0.359375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999940037727356,
+      "sampling/importance_sampling_ratio/min": 7.431909580191132e-06,
+      "sampling/sampling_logp_difference/max": 11.809727668762207,
+      "sampling/sampling_logp_difference/mean": 0.019014043733477592,
+      "step": 482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.55851120666739e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.55851120666739e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14716.0,
+      "completions/mean_length": 6146.2109375,
+      "completions/mean_terminated_length": 6065.5986328125,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "entropy": 0.8365580290555954,
+      "epoch": 0.44434222631094755,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025550283025950193,
+      "learning_rate": 1e-05,
+      "loss": 0.0548,
+      "num_tokens": 425709212.0,
+      "reward": 0.5625,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000015497207642,
+      "sampling/importance_sampling_ratio/min": 0.0006884043687023222,
+      "sampling/sampling_logp_difference/max": 7.281134128570557,
+      "sampling/sampling_logp_difference/mean": 0.019193854182958603,
+      "step": 483
+    },
+    {
+      "clip_ratio/high_max": 2.4752349872869672e-05,
+      "clip_ratio/high_mean": 7.036488455014478e-06,
+      "clip_ratio/low_mean": 4.780410063176532e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.484058920046664e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16153.0,
+      "completions/mean_length": 6557.578125,
+      "completions/mean_terminated_length": 6321.744140625,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.8316832035779953,
+      "epoch": 0.4452621895124195,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005126865580677986,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 426566462.0,
+      "reward": 0.484375,
+      "reward_std": 0.27852246165275574,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999485015869141,
+      "sampling/importance_sampling_ratio/min": 2.7536634661373682e-05,
+      "sampling/sampling_logp_difference/max": 10.499993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01839536987245083,
+      "step": 484
+    },
+    {
+      "clip_ratio/high_max": 3.443571449679439e-05,
+      "clip_ratio/high_mean": 8.608928624198597e-06,
+      "clip_ratio/low_mean": 5.915772453590762e-05,
+      "clip_ratio/low_min": 1.7084812043322017e-05,
+      "clip_ratio/region_mean": 6.776665304641938e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16359.0,
+      "completions/mean_length": 7007.3203125,
+      "completions/mean_terminated_length": 6858.484375,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.8674142584204674,
+      "epoch": 0.44618215271389144,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004829525947570801,
+      "learning_rate": 1e-05,
+      "loss": 0.0753,
+      "num_tokens": 427480007.0,
+      "reward": 0.46875,
+      "reward_std": 0.3874102830886841,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998922944068909,
+      "sampling/importance_sampling_ratio/min": 0.00020170137577224523,
+      "sampling/sampling_logp_difference/max": 8.508722305297852,
+      "sampling/sampling_logp_difference/mean": 0.019586069509387016,
+      "step": 485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.539863354897534e-05,
+      "clip_ratio/low_min": 8.211341992137022e-06,
+      "clip_ratio/region_mean": 5.539863354897534e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14748.0,
+      "completions/mean_length": 7069.8828125,
+      "completions/mean_terminated_length": 6922.0400390625,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.9066255167126656,
+      "epoch": 0.44710211591536336,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003539952216669917,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "num_tokens": 428404968.0,
+      "reward": 0.5,
+      "reward_std": 0.3618982434272766,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999353885650635,
+      "sampling/importance_sampling_ratio/min": 0.00024052867956925184,
+      "sampling/sampling_logp_difference/max": 8.332671165466309,
+      "sampling/sampling_logp_difference/mean": 0.020427238196134567,
+      "step": 486
+    },
+    {
+      "clip_ratio/high_max": 1.6550495729461545e-05,
+      "clip_ratio/high_mean": 4.137623932365386e-06,
+      "clip_ratio/low_mean": 5.576918465521885e-05,
+      "clip_ratio/low_min": 1.2613936178240692e-05,
+      "clip_ratio/region_mean": 5.99068093833921e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15290.0,
+      "completions/max_terminated_length": 15290.0,
+      "completions/mean_length": 5586.6875,
+      "completions/mean_terminated_length": 5586.6875,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.9208655655384064,
+      "epoch": 0.44802207911683534,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0030504625756293535,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "num_tokens": 429137176.0,
+      "reward": 0.515625,
+      "reward_std": 0.3480040729045868,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999984502792358,
+      "sampling/importance_sampling_ratio/min": 0.0005498559912666678,
+      "sampling/sampling_logp_difference/max": 7.50585412979126,
+      "sampling/sampling_logp_difference/mean": 0.019396595656871796,
+      "step": 487
+    },
+    {
+      "clip_ratio/high_max": 3.3761509712348925e-05,
+      "clip_ratio/high_mean": 8.440377428087231e-06,
+      "clip_ratio/low_mean": 3.6384140912559815e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.482451868170756e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15404.0,
+      "completions/mean_length": 5266.265625,
+      "completions/mean_terminated_length": 4999.4404296875,
+      "completions/min_length": 492.0,
+      "completions/min_terminated_length": 492.0,
+      "entropy": 0.7884859293699265,
+      "epoch": 0.44894204231830726,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003902251599356532,
+      "learning_rate": 1e-05,
+      "loss": -0.0077,
+      "num_tokens": 429836026.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2948455810546875,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999457001686096,
+      "sampling/importance_sampling_ratio/min": 0.05675617232918739,
+      "sampling/sampling_logp_difference/max": 2.868990898132324,
+      "sampling/sampling_logp_difference/mean": 0.01770034246146679,
+      "step": 488
+    },
+    {
+      "clip_ratio/high_max": 2.2323702978610527e-05,
+      "clip_ratio/high_mean": 5.580925744652632e-06,
+      "clip_ratio/low_mean": 4.0199149452746497e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.578007497002545e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15754.0,
+      "completions/mean_length": 6398.53125,
+      "completions/mean_terminated_length": 6319.9052734375,
+      "completions/min_length": 699.0,
+      "completions/min_terminated_length": 699.0,
+      "entropy": 0.8982341960072517,
+      "epoch": 0.44986200551977923,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0024998660665005445,
+      "learning_rate": 1e-05,
+      "loss": 0.0508,
+      "num_tokens": 430673446.0,
+      "reward": 0.421875,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999797940254211,
+      "sampling/importance_sampling_ratio/min": 0.000612784584518522,
+      "sampling/sampling_logp_difference/max": 7.397497177124023,
+      "sampling/sampling_logp_difference/mean": 0.020521972328424454,
+      "step": 489
+    },
+    {
+      "clip_ratio/high_max": 3.1756624366607866e-05,
+      "clip_ratio/high_mean": 7.939156091651967e-06,
+      "clip_ratio/low_mean": 8.124458963720826e-05,
+      "clip_ratio/low_min": 1.2379174222587608e-05,
+      "clip_ratio/region_mean": 8.91837471499457e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14374.0,
+      "completions/mean_length": 6277.65625,
+      "completions/mean_terminated_length": 6198.07861328125,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.8139145970344543,
+      "epoch": 0.45078196872125115,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.00784115307033062,
+      "learning_rate": 1e-05,
+      "loss": 0.0798,
+      "num_tokens": 431497546.0,
+      "reward": 0.546875,
+      "reward_std": 0.37716054916381836,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999848484992981,
+      "sampling/importance_sampling_ratio/min": 0.0006267798598855734,
+      "sampling/sampling_logp_difference/max": 7.37491512298584,
+      "sampling/sampling_logp_difference/mean": 0.01836184598505497,
+      "step": 490
+    },
+    {
+      "clip_ratio/high_max": 8.875004823494237e-06,
+      "clip_ratio/high_mean": 2.2187512058735592e-06,
+      "clip_ratio/low_mean": 2.3825880248296016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.6044631454169576e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15903.0,
+      "completions/mean_length": 7708.59375,
+      "completions/mean_terminated_length": 7355.9345703125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 1.087083138525486,
+      "epoch": 0.45170193192272307,
+      "frac_reward_zero_std": 0.625,
+      "grad_norm": 0.004277343396097422,
+      "learning_rate": 1e-05,
+      "loss": 0.035,
+      "num_tokens": 432503414.0,
+      "reward": 0.2890625,
+      "reward_std": 0.1633366346359253,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999503493309021,
+      "sampling/importance_sampling_ratio/min": 1.2187546417408157e-05,
+      "sampling/sampling_logp_difference/max": 11.315095901489258,
+      "sampling/sampling_logp_difference/mean": 0.02224145457148552,
+      "step": 491
+    },
+    {
+      "clip_ratio/high_max": 6.384065272868611e-06,
+      "clip_ratio/high_mean": 1.5960163182171527e-06,
+      "clip_ratio/low_mean": 3.561227788395627e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.720829374742607e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15576.0,
+      "completions/mean_length": 7162.7109375,
+      "completions/mean_terminated_length": 6865.25,
+      "completions/min_length": 842.0,
+      "completions/min_terminated_length": 842.0,
+      "entropy": 0.9157010763883591,
+      "epoch": 0.45262189512419504,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.006278311368077993,
+      "learning_rate": 1e-05,
+      "loss": 0.0459,
+      "num_tokens": 433439137.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2227931171655655,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966561794281,
+      "sampling/importance_sampling_ratio/min": 0.0005532125360332429,
+      "sampling/sampling_logp_difference/max": 7.499768257141113,
+      "sampling/sampling_logp_difference/mean": 0.02123419940471649,
+      "step": 492
+    },
+    {
+      "clip_ratio/high_max": 2.846911434062349e-05,
+      "clip_ratio/high_mean": 8.656040449750435e-06,
+      "clip_ratio/low_mean": 5.1716241614485625e-05,
+      "clip_ratio/low_min": 3.601579010137357e-06,
+      "clip_ratio/region_mean": 6.037228104105452e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16123.0,
+      "completions/mean_length": 7388.90625,
+      "completions/mean_terminated_length": 7023.251953125,
+      "completions/min_length": 980.0,
+      "completions/min_terminated_length": 980.0,
+      "entropy": 0.7670486867427826,
+      "epoch": 0.45354185832566696,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005177734419703484,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "num_tokens": 434402045.0,
+      "reward": 0.3828125,
+      "reward_std": 0.37951958179473877,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999250769615173,
+      "sampling/importance_sampling_ratio/min": 0.0022511729039251804,
+      "sampling/sampling_logp_difference/max": 6.096303939819336,
+      "sampling/sampling_logp_difference/mean": 0.01827731542289257,
+      "step": 493
+    },
+    {
+      "clip_ratio/high_max": 2.1548471977439476e-05,
+      "clip_ratio/high_mean": 6.257203722270788e-06,
+      "clip_ratio/low_mean": 7.719641234871233e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 8.345361538886209e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15767.0,
+      "completions/mean_length": 6805.375,
+      "completions/mean_terminated_length": 6496.38671875,
+      "completions/min_length": 587.0,
+      "completions/min_terminated_length": 587.0,
+      "entropy": 0.8407405763864517,
+      "epoch": 0.45446182152713893,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032320048194378614,
+      "learning_rate": 1e-05,
+      "loss": 0.0662,
+      "num_tokens": 435292029.0,
+      "reward": 0.4296875,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999642372131348,
+      "sampling/importance_sampling_ratio/min": 6.679954094579443e-05,
+      "sampling/sampling_logp_difference/max": 9.613814353942871,
+      "sampling/sampling_logp_difference/mean": 0.018761277198791504,
+      "step": 494
+    },
+    {
+      "clip_ratio/high_max": 3.460495008766884e-06,
+      "clip_ratio/high_mean": 8.65123752191721e-07,
+      "clip_ratio/low_mean": 7.76378024056612e-05,
+      "clip_ratio/low_min": 1.7026316072588088e-05,
+      "clip_ratio/region_mean": 7.850292649891344e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15105.0,
+      "completions/mean_length": 5753.4140625,
+      "completions/mean_terminated_length": 5321.2763671875,
+      "completions/min_length": 946.0,
+      "completions/min_terminated_length": 946.0,
+      "entropy": 0.7848984077572823,
+      "epoch": 0.45538178472861085,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030854379292577505,
+      "learning_rate": 1e-05,
+      "loss": 0.0279,
+      "num_tokens": 436046842.0,
+      "reward": 0.578125,
+      "reward_std": 0.31405961513519287,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998626708984375,
+      "sampling/importance_sampling_ratio/min": 4.36544311810394e-09,
+      "sampling/sampling_logp_difference/max": 19.24954605102539,
+      "sampling/sampling_logp_difference/mean": 0.017733070999383926,
+      "step": 495
+    },
+    {
+      "clip_ratio/high_max": 1.7207588371093152e-05,
+      "clip_ratio/high_mean": 4.301897092773288e-06,
+      "clip_ratio/low_mean": 3.234025916754035e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.664215591925313e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15682.0,
+      "completions/mean_length": 6522.84375,
+      "completions/mean_terminated_length": 6445.19677734375,
+      "completions/min_length": 1062.0,
+      "completions/min_terminated_length": 1062.0,
+      "entropy": 1.0593653172254562,
+      "epoch": 0.4563017479300828,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003124243812635541,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "num_tokens": 436899638.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2706219553947449,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999418258666992,
+      "sampling/importance_sampling_ratio/min": 4.476920821616659e-06,
+      "sampling/sampling_logp_difference/max": 12.316575050354004,
+      "sampling/sampling_logp_difference/mean": 0.021180003881454468,
+      "step": 496
+    },
+    {
+      "clip_ratio/high_max": 1.1790433973146719e-05,
+      "clip_ratio/high_mean": 2.9476084932866797e-06,
+      "clip_ratio/low_mean": 2.8437304308681632e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.138491274512489e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14515.0,
+      "completions/mean_length": 6203.203125,
+      "completions/mean_terminated_length": 5874.7900390625,
+      "completions/min_length": 1017.0,
+      "completions/min_terminated_length": 1017.0,
+      "entropy": 0.8152795508503914,
+      "epoch": 0.45722171113155474,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005001795012503862,
+      "learning_rate": 1e-05,
+      "loss": 0.0817,
+      "num_tokens": 437713008.0,
+      "reward": 0.4296875,
+      "reward_std": 0.26143795251846313,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999101758003235,
+      "sampling/importance_sampling_ratio/min": 0.001757707679644227,
+      "sampling/sampling_logp_difference/max": 6.34374475479126,
+      "sampling/sampling_logp_difference/mean": 0.017751028761267662,
+      "step": 497
+    },
+    {
+      "clip_ratio/high_max": 1.3163793028070359e-05,
+      "clip_ratio/high_mean": 4.229499381835922e-06,
+      "clip_ratio/low_mean": 4.4599403963729856e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.882890357293945e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15423.0,
+      "completions/mean_length": 5975.5234375,
+      "completions/mean_terminated_length": 5725.72021484375,
+      "completions/min_length": 690.0,
+      "completions/min_terminated_length": 690.0,
+      "entropy": 0.8275932744145393,
+      "epoch": 0.45814167433302666,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005084732081741095,
+      "learning_rate": 1e-05,
+      "loss": 0.0759,
+      "num_tokens": 438495811.0,
+      "reward": 0.5390625,
+      "reward_std": 0.28513264656066895,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998699426651001,
+      "sampling/importance_sampling_ratio/min": 3.120788460364565e-05,
+      "sampling/sampling_logp_difference/max": 10.374839782714844,
+      "sampling/sampling_logp_difference/mean": 0.018671832978725433,
+      "step": 498
+    },
+    {
+      "clip_ratio/high_max": 3.229640242352616e-06,
+      "clip_ratio/high_mean": 8.07410060588154e-07,
+      "clip_ratio/low_mean": 3.0413870263146237e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1221280551108066e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16110.0,
+      "completions/max_terminated_length": 16110.0,
+      "completions/mean_length": 7019.59375,
+      "completions/mean_terminated_length": 7019.59375,
+      "completions/min_length": 1058.0,
+      "completions/min_terminated_length": 1058.0,
+      "entropy": 0.9266618490219116,
+      "epoch": 0.45906163753449863,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.002567912917584181,
+      "learning_rate": 1e-05,
+      "loss": 0.0282,
+      "num_tokens": 439413055.0,
+      "reward": 0.375,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000476837158203,
+      "sampling/importance_sampling_ratio/min": 0.0010315657127648592,
+      "sampling/sampling_logp_difference/max": 6.876677513122559,
+      "sampling/sampling_logp_difference/mean": 0.02012534812092781,
+      "step": 499
+    },
+    {
+      "clip_ratio/high_max": 1.8327779343962902e-05,
+      "clip_ratio/high_mean": 4.5819448359907256e-06,
+      "clip_ratio/low_mean": 4.08189575864526e-05,
+      "clip_ratio/low_min": 4.041122338094283e-06,
+      "clip_ratio/region_mean": 4.5400901854009135e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 7373.3203125,
+      "completions/mean_terminated_length": 7082.65283203125,
+      "completions/min_length": 854.0,
+      "completions/min_terminated_length": 854.0,
+      "entropy": 0.9383682310581207,
+      "epoch": 0.45998160073597055,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004862098954617977,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 440375128.0,
+      "reward": 0.4375,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999188780784607,
+      "sampling/importance_sampling_ratio/min": 0.0006883886526338756,
+      "sampling/sampling_logp_difference/max": 7.28115701675415,
+      "sampling/sampling_logp_difference/mean": 0.020596595481038094,
+      "step": 500
+    },
+    {
+      "clip_ratio/high_max": 1.650619151405408e-05,
+      "clip_ratio/high_mean": 4.12654787851352e-06,
+      "clip_ratio/low_mean": 6.364750265674957e-05,
+      "clip_ratio/low_min": 3.94595599573222e-06,
+      "clip_ratio/region_mean": 6.77740499668289e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16280.0,
+      "completions/mean_length": 5944.953125,
+      "completions/mean_terminated_length": 5862.755859375,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "entropy": 0.9130716845393181,
+      "epoch": 0.4609015639374425,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.003041388699784875,
+      "learning_rate": 1e-05,
+      "loss": 0.0316,
+      "num_tokens": 441156306.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3345639705657959,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999566078186035,
+      "sampling/importance_sampling_ratio/min": 0.0007685241289436817,
+      "sampling/sampling_logp_difference/max": 7.171038627624512,
+      "sampling/sampling_logp_difference/mean": 0.019817989319562912,
+      "step": 501
+    },
+    {
+      "clip_ratio/high_max": 2.9951792839710834e-05,
+      "clip_ratio/high_mean": 9.205811807078135e-06,
+      "clip_ratio/low_mean": 3.147234815514821e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0678160075913183e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16181.0,
+      "completions/mean_length": 6686.015625,
+      "completions/mean_terminated_length": 6609.6533203125,
+      "completions/min_length": 1018.0,
+      "completions/min_terminated_length": 1018.0,
+      "entropy": 0.8640913739800453,
+      "epoch": 0.46182152713891444,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005679543130099773,
+      "learning_rate": 1e-05,
+      "loss": 0.0306,
+      "num_tokens": 442032972.0,
+      "reward": 0.5546875,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 0.007731473073363304,
+      "sampling/sampling_logp_difference/max": 4.86245584487915,
+      "sampling/sampling_logp_difference/mean": 0.019738182425498962,
+      "step": 502
+    },
+    {
+      "clip_ratio/high_max": 3.0190597726686974e-05,
+      "clip_ratio/high_mean": 7.5476494316717435e-06,
+      "clip_ratio/low_mean": 3.858067566397949e-05,
+      "clip_ratio/low_min": 9.290916750614997e-06,
+      "clip_ratio/region_mean": 4.612832617567619e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15951.0,
+      "completions/mean_length": 6945.5,
+      "completions/mean_terminated_length": 6231.6640625,
+      "completions/min_length": 1031.0,
+      "completions/min_terminated_length": 1031.0,
+      "entropy": 0.8156519457697868,
+      "epoch": 0.46274149034038636,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006176612339913845,
+      "learning_rate": 1e-05,
+      "loss": 0.0756,
+      "num_tokens": 442940940.0,
+      "reward": 0.46875,
+      "reward_std": 0.29644322395324707,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999117851257324,
+      "sampling/importance_sampling_ratio/min": 0.00018278000061400235,
+      "sampling/sampling_logp_difference/max": 8.607227325439453,
+      "sampling/sampling_logp_difference/mean": 0.01836501806974411,
+      "step": 503
+    },
+    {
+      "clip_ratio/high_max": 2.2105000425653998e-05,
+      "clip_ratio/high_mean": 6.28071654773521e-06,
+      "clip_ratio/low_mean": 3.060894187001395e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.6889658531436e-05,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15847.0,
+      "completions/mean_length": 8068.5390625,
+      "completions/mean_terminated_length": 7363.8388671875,
+      "completions/min_length": 875.0,
+      "completions/min_terminated_length": 875.0,
+      "entropy": 0.8196670189499855,
+      "epoch": 0.46366145354185834,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0021770994644612074,
+      "learning_rate": 1e-05,
+      "loss": 0.0386,
+      "num_tokens": 443992041.0,
+      "reward": 0.4453125,
+      "reward_std": 0.30115634202957153,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999759197235107,
+      "sampling/importance_sampling_ratio/min": 0.0001795605494407937,
+      "sampling/sampling_logp_difference/max": 8.624998092651367,
+      "sampling/sampling_logp_difference/mean": 0.019003838300704956,
+      "step": 504
+    },
+    {
+      "clip_ratio/high_max": 1.287241002501105e-05,
+      "clip_ratio/high_mean": 3.2181025062527624e-06,
+      "clip_ratio/low_mean": 4.5685408849749365e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.89035115833758e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15168.0,
+      "completions/mean_length": 5209.140625,
+      "completions/mean_terminated_length": 5031.76220703125,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "entropy": 0.8851845487952232,
+      "epoch": 0.46458141674333026,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00788798462599516,
+      "learning_rate": 1e-05,
+      "loss": 0.063,
+      "num_tokens": 444679675.0,
+      "reward": 0.4609375,
+      "reward_std": 0.33220988512039185,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999796748161316,
+      "sampling/importance_sampling_ratio/min": 0.00025673024356365204,
+      "sampling/sampling_logp_difference/max": 8.267484664916992,
+      "sampling/sampling_logp_difference/mean": 0.018808994442224503,
+      "step": 505
+    },
+    {
+      "clip_ratio/high_max": 2.294301202709903e-05,
+      "clip_ratio/high_mean": 6.590465602585027e-06,
+      "clip_ratio/low_mean": 5.944662643742049e-05,
+      "clip_ratio/low_min": 8.106994755507912e-06,
+      "clip_ratio/region_mean": 6.603709243790945e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16259.0,
+      "completions/mean_length": 7558.8984375,
+      "completions/mean_terminated_length": 7274.21728515625,
+      "completions/min_length": 707.0,
+      "completions/min_terminated_length": 707.0,
+      "entropy": 1.003449946641922,
+      "epoch": 0.46550137994480223,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.004547314252704382,
+      "learning_rate": 1e-05,
+      "loss": 0.1586,
+      "num_tokens": 445668126.0,
+      "reward": 0.421875,
+      "reward_std": 0.42293959856033325,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999848484992981,
+      "sampling/importance_sampling_ratio/min": 0.00011622780584730208,
+      "sampling/sampling_logp_difference/max": 9.059958457946777,
+      "sampling/sampling_logp_difference/mean": 0.02099413052201271,
+      "step": 506
+    },
+    {
+      "clip_ratio/high_max": 2.1350435872591333e-05,
+      "clip_ratio/high_mean": 6.047981628398702e-06,
+      "clip_ratio/low_mean": 8.880347786544007e-05,
+      "clip_ratio/low_min": 9.06585455595632e-06,
+      "clip_ratio/region_mean": 9.485145938015194e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16137.0,
+      "completions/max_terminated_length": 16137.0,
+      "completions/mean_length": 6066.6015625,
+      "completions/mean_terminated_length": 6066.6015625,
+      "completions/min_length": 1107.0,
+      "completions/min_terminated_length": 1107.0,
+      "entropy": 0.8450648710131645,
+      "epoch": 0.46642134314627415,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004621773958206177,
+      "learning_rate": 1e-05,
+      "loss": 0.121,
+      "num_tokens": 446464587.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3537652790546417,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000154972076416,
+      "sampling/importance_sampling_ratio/min": 1.3950601896794979e-05,
+      "sampling/sampling_logp_difference/max": 11.179987907409668,
+      "sampling/sampling_logp_difference/mean": 0.018016980960965157,
+      "step": 507
+    },
+    {
+      "clip_ratio/high_max": 3.0534724828612525e-06,
+      "clip_ratio/high_mean": 7.633681207153131e-07,
+      "clip_ratio/low_mean": 2.149350007130124e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2256868305703392e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 6988.0234375,
+      "completions/mean_terminated_length": 6838.88134765625,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 1.0452716201543808,
+      "epoch": 0.46734130634774607,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004523546434938908,
+      "learning_rate": 1e-05,
+      "loss": 0.0396,
+      "num_tokens": 447381134.0,
+      "reward": 0.3515625,
+      "reward_std": 0.22567617893218994,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999901056289673,
+      "sampling/importance_sampling_ratio/min": 0.016167031601071358,
+      "sampling/sampling_logp_difference/max": 4.124781131744385,
+      "sampling/sampling_logp_difference/mean": 0.021812722086906433,
+      "step": 508
+    },
+    {
+      "clip_ratio/high_max": 5.58759120394825e-06,
+      "clip_ratio/high_mean": 1.3968978009870625e-06,
+      "clip_ratio/low_mean": 3.684896307731833e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.824586099199223e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12316.0,
+      "completions/max_terminated_length": 12316.0,
+      "completions/mean_length": 5948.5,
+      "completions/mean_terminated_length": 5948.5,
+      "completions/min_length": 1252.0,
+      "completions/min_terminated_length": 1252.0,
+      "entropy": 0.8241566568613052,
+      "epoch": 0.46826126954921804,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004002885892987251,
+      "learning_rate": 1e-05,
+      "loss": 0.0188,
+      "num_tokens": 448158014.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3124620020389557,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999228715896606,
+      "sampling/importance_sampling_ratio/min": 0.0008566387114115059,
+      "sampling/sampling_logp_difference/max": 7.062494277954102,
+      "sampling/sampling_logp_difference/mean": 0.018487900495529175,
+      "step": 509
+    },
+    {
+      "clip_ratio/high_max": 1.0490723752809572e-05,
+      "clip_ratio/high_mean": 3.439610338773491e-06,
+      "clip_ratio/low_mean": 3.973086239739132e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.3170473020381905e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16044.0,
+      "completions/mean_length": 7966.375,
+      "completions/mean_terminated_length": 7764.3525390625,
+      "completions/min_length": 660.0,
+      "completions/min_terminated_length": 660.0,
+      "entropy": 0.8868448063731194,
+      "epoch": 0.46918123275068996,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0019062751671299338,
+      "learning_rate": 1e-05,
+      "loss": 0.0787,
+      "num_tokens": 449197054.0,
+      "reward": 0.40625,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999173879623413,
+      "sampling/importance_sampling_ratio/min": 0.0001614262000657618,
+      "sampling/sampling_logp_difference/max": 8.731462478637695,
+      "sampling/sampling_logp_difference/mean": 0.020015282556414604,
+      "step": 510
+    },
+    {
+      "clip_ratio/high_max": 1.2195105682621943e-05,
+      "clip_ratio/high_mean": 3.0487764206554857e-06,
+      "clip_ratio/low_mean": 3.558348203114292e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8632259474979946e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16124.0,
+      "completions/mean_length": 6520.0234375,
+      "completions/mean_terminated_length": 6442.3544921875,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.9168323278427124,
+      "epoch": 0.47010119595216193,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00490277074277401,
+      "learning_rate": 1e-05,
+      "loss": 0.0547,
+      "num_tokens": 450050153.0,
+      "reward": 0.484375,
+      "reward_std": 0.3437528908252716,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998800754547119,
+      "sampling/importance_sampling_ratio/min": 4.4418397919798736e-06,
+      "sampling/sampling_logp_difference/max": 12.324441909790039,
+      "sampling/sampling_logp_difference/mean": 0.020178331062197685,
+      "step": 511
+    },
+    {
+      "clip_ratio/high_max": 7.95772848505294e-06,
+      "clip_ratio/high_mean": 1.989432121263235e-06,
+      "clip_ratio/low_mean": 3.363800146871654e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.562743381735345e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16252.0,
+      "completions/mean_length": 6614.5625,
+      "completions/mean_terminated_length": 6217.4306640625,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.8635925352573395,
+      "epoch": 0.47102115915363385,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003792276605963707,
+      "learning_rate": 1e-05,
+      "loss": -0.0023,
+      "num_tokens": 450915281.0,
+      "reward": 0.5,
+      "reward_std": 0.20069602131843567,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999154806137085,
+      "sampling/importance_sampling_ratio/min": 0.004489119164645672,
+      "sampling/sampling_logp_difference/max": 5.40609884262085,
+      "sampling/sampling_logp_difference/mean": 0.019233014434576035,
+      "step": 512
+    },
+    {
+      "clip_ratio/high_max": 1.6306271390931215e-05,
+      "clip_ratio/high_mean": 6.67555605105008e-06,
+      "clip_ratio/low_mean": 3.4846169796765025e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1521726302562456e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16206.0,
+      "completions/mean_length": 6458.5078125,
+      "completions/mean_terminated_length": 5970.36865234375,
+      "completions/min_length": 1025.0,
+      "completions/min_terminated_length": 1025.0,
+      "entropy": 0.8816124573349953,
+      "epoch": 0.47194112235510577,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0031763892620801926,
+      "learning_rate": 1e-05,
+      "loss": 0.0287,
+      "num_tokens": 451761322.0,
+      "reward": 0.4921875,
+      "reward_std": 0.282474160194397,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999036192893982,
+      "sampling/importance_sampling_ratio/min": 9.611394489184022e-05,
+      "sampling/sampling_logp_difference/max": 9.24997615814209,
+      "sampling/sampling_logp_difference/mean": 0.01935420371592045,
+      "step": 513
+    },
+    {
+      "clip_ratio/high_max": 7.861634912842419e-06,
+      "clip_ratio/high_mean": 3.0314158721012063e-06,
+      "clip_ratio/low_mean": 2.2518463538290234e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.554987941039144e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15928.0,
+      "completions/mean_length": 5844.03125,
+      "completions/mean_terminated_length": 5676.73046875,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "entropy": 0.9008020162582397,
+      "epoch": 0.47286108555657774,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004134794697165489,
+      "learning_rate": 1e-05,
+      "loss": 0.1094,
+      "num_tokens": 452526342.0,
+      "reward": 0.546875,
+      "reward_std": 0.28930899500846863,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999297857284546,
+      "sampling/importance_sampling_ratio/min": 0.00012955136480741203,
+      "sampling/sampling_logp_difference/max": 8.951433181762695,
+      "sampling/sampling_logp_difference/mean": 0.02013866975903511,
+      "step": 514
+    },
+    {
+      "clip_ratio/high_max": 1.2711160707112867e-05,
+      "clip_ratio/high_mean": 3.177790176778217e-06,
+      "clip_ratio/low_mean": 2.444096298859222e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.761875293799676e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16180.0,
+      "completions/mean_length": 6214.5859375,
+      "completions/mean_terminated_length": 6134.51171875,
+      "completions/min_length": 1096.0,
+      "completions/min_terminated_length": 1096.0,
+      "entropy": 0.9522949978709221,
+      "epoch": 0.47378104875804966,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022520655766129494,
+      "learning_rate": 1e-05,
+      "loss": 0.0193,
+      "num_tokens": 453343385.0,
+      "reward": 0.4921875,
+      "reward_std": 0.20623260736465454,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999879598617554,
+      "sampling/importance_sampling_ratio/min": 3.763851054827683e-05,
+      "sampling/sampling_logp_difference/max": 10.187482833862305,
+      "sampling/sampling_logp_difference/mean": 0.019947605207562447,
+      "step": 515
+    },
+    {
+      "clip_ratio/high_max": 5.724247012039996e-05,
+      "clip_ratio/high_mean": 1.431061753009999e-05,
+      "clip_ratio/low_mean": 3.371703428456385e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.8027652155724354e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14376.0,
+      "completions/mean_length": 7138.515625,
+      "completions/mean_terminated_length": 7065.71630859375,
+      "completions/min_length": 846.0,
+      "completions/min_terminated_length": 846.0,
+      "entropy": 0.8856206461787224,
+      "epoch": 0.47470101195952163,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004887089133262634,
+      "learning_rate": 1e-05,
+      "loss": 0.0609,
+      "num_tokens": 454275379.0,
+      "reward": 0.4609375,
+      "reward_std": 0.32035762071609497,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999544620513916,
+      "sampling/importance_sampling_ratio/min": 0.004931141622364521,
+      "sampling/sampling_logp_difference/max": 5.312184810638428,
+      "sampling/sampling_logp_difference/mean": 0.019449077546596527,
+      "step": 516
+    },
+    {
+      "clip_ratio/high_max": 1.5607688055752078e-05,
+      "clip_ratio/high_mean": 3.9019220139380195e-06,
+      "clip_ratio/low_mean": 4.936055870530254e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.326248106030107e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15855.0,
+      "completions/mean_length": 6077.796875,
+      "completions/mean_terminated_length": 5915.00830078125,
+      "completions/min_length": 954.0,
+      "completions/min_terminated_length": 954.0,
+      "entropy": 0.862022191286087,
+      "epoch": 0.47562097516099355,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003875041613355279,
+      "learning_rate": 1e-05,
+      "loss": 0.0366,
+      "num_tokens": 455076625.0,
+      "reward": 0.4921875,
+      "reward_std": 0.23933593928813934,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000392198562622,
+      "sampling/importance_sampling_ratio/min": 3.322543852846138e-05,
+      "sampling/sampling_logp_difference/max": 10.31219482421875,
+      "sampling/sampling_logp_difference/mean": 0.018907926976680756,
+      "step": 517
+    },
+    {
+      "clip_ratio/high_max": 1.0557040241110371e-05,
+      "clip_ratio/high_mean": 3.535163386914064e-06,
+      "clip_ratio/low_mean": 3.7409978290270374e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.0945141790871276e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15316.0,
+      "completions/max_terminated_length": 15316.0,
+      "completions/mean_length": 6211.65625,
+      "completions/mean_terminated_length": 6211.65625,
+      "completions/min_length": 1292.0,
+      "completions/min_terminated_length": 1292.0,
+      "entropy": 0.8835236355662346,
+      "epoch": 0.4765409383624655,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004288897849619389,
+      "learning_rate": 1e-05,
+      "loss": 0.0822,
+      "num_tokens": 455889693.0,
+      "reward": 0.53125,
+      "reward_std": 0.27145031094551086,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999270439147949,
+      "sampling/importance_sampling_ratio/min": 2.5614745027269237e-06,
+      "sampling/sampling_logp_difference/max": 12.874927520751953,
+      "sampling/sampling_logp_difference/mean": 0.01986120268702507,
+      "step": 518
+    },
+    {
+      "clip_ratio/high_max": 2.842265530489385e-06,
+      "clip_ratio/high_mean": 7.105663826223463e-07,
+      "clip_ratio/low_mean": 3.578249538804812e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.649306199804414e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16128.0,
+      "completions/mean_length": 7035.609375,
+      "completions/mean_terminated_length": 6962.0,
+      "completions/min_length": 762.0,
+      "completions/min_terminated_length": 762.0,
+      "entropy": 0.9033957049250603,
+      "epoch": 0.47746090156393745,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.004230308346450329,
+      "learning_rate": 1e-05,
+      "loss": 0.0311,
+      "num_tokens": 456809643.0,
+      "reward": 0.3203125,
+      "reward_std": 0.17282497882843018,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999722242355347,
+      "sampling/importance_sampling_ratio/min": 1.670435995038133e-05,
+      "sampling/sampling_logp_difference/max": 10.99984073638916,
+      "sampling/sampling_logp_difference/mean": 0.020262110978364944,
+      "step": 519
+    },
+    {
+      "clip_ratio/high_max": 3.539844283295679e-05,
+      "clip_ratio/high_mean": 9.844010264714598e-06,
+      "clip_ratio/low_mean": 2.8534720058814855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.837873060774655e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16241.0,
+      "completions/mean_length": 6557.40625,
+      "completions/mean_terminated_length": 6321.568359375,
+      "completions/min_length": 1136.0,
+      "completions/min_terminated_length": 1136.0,
+      "entropy": 0.8352414071559906,
+      "epoch": 0.47838086476540936,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0029154124204069376,
+      "learning_rate": 1e-05,
+      "loss": 0.0204,
+      "num_tokens": 457669431.0,
+      "reward": 0.4375,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000226497650146,
+      "sampling/importance_sampling_ratio/min": 5.8480534789850935e-05,
+      "sampling/sampling_logp_difference/max": 9.746816635131836,
+      "sampling/sampling_logp_difference/mean": 0.019474683329463005,
+      "step": 520
+    },
+    {
+      "clip_ratio/high_max": 6.400114170901361e-05,
+      "clip_ratio/high_mean": 1.917558859076962e-05,
+      "clip_ratio/low_mean": 5.166920755073079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.084479466357152e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15428.0,
+      "completions/mean_length": 6444.1328125,
+      "completions/mean_terminated_length": 6205.576171875,
+      "completions/min_length": 398.0,
+      "completions/min_terminated_length": 398.0,
+      "entropy": 0.7480100840330124,
+      "epoch": 0.47930082796688134,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0025195449125021696,
+      "learning_rate": 1e-05,
+      "loss": 0.0248,
+      "num_tokens": 458512648.0,
+      "reward": 0.515625,
+      "reward_std": 0.2585597634315491,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999996542930603,
+      "sampling/importance_sampling_ratio/min": 2.4302940801135264e-05,
+      "sampling/sampling_logp_difference/max": 10.624913215637207,
+      "sampling/sampling_logp_difference/mean": 0.01779567077755928,
+      "step": 521
+    },
+    {
+      "clip_ratio/high_max": 2.748944325503544e-06,
+      "clip_ratio/high_mean": 6.87236081375886e-07,
+      "clip_ratio/low_mean": 3.4855478702411347e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5542715181691165e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15868.0,
+      "completions/mean_length": 6615.234375,
+      "completions/mean_terminated_length": 6380.7841796875,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "entropy": 0.8428665772080421,
+      "epoch": 0.48022079116835326,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004339073318988085,
+      "learning_rate": 1e-05,
+      "loss": 0.0608,
+      "num_tokens": 459377790.0,
+      "reward": 0.5234375,
+      "reward_std": 0.31064465641975403,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999370574951172,
+      "sampling/importance_sampling_ratio/min": 0.00042492515058256686,
+      "sampling/sampling_logp_difference/max": 7.76359748840332,
+      "sampling/sampling_logp_difference/mean": 0.018815383315086365,
+      "step": 522
+    },
+    {
+      "clip_ratio/high_max": 2.2513844896820956e-05,
+      "clip_ratio/high_mean": 7.496596083456097e-06,
+      "clip_ratio/low_mean": 2.2591082483813807e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0087678169365972e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15239.0,
+      "completions/mean_length": 6200.3203125,
+      "completions/mean_terminated_length": 5955.912109375,
+      "completions/min_length": 1032.0,
+      "completions/min_terminated_length": 1032.0,
+      "entropy": 0.9044734612107277,
+      "epoch": 0.48114075436982523,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005003004334867001,
+      "learning_rate": 1e-05,
+      "loss": 0.0502,
+      "num_tokens": 460189823.0,
+      "reward": 0.484375,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999645948410034,
+      "sampling/importance_sampling_ratio/min": 0.005019097588956356,
+      "sampling/sampling_logp_difference/max": 5.2945051193237305,
+      "sampling/sampling_logp_difference/mean": 0.0192951001226902,
+      "step": 523
+    },
+    {
+      "clip_ratio/high_max": 1.9086801785306307e-05,
+      "clip_ratio/high_mean": 4.771700446326577e-06,
+      "clip_ratio/low_mean": 3.145246773783583e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.622416772941506e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15706.0,
+      "completions/max_terminated_length": 15706.0,
+      "completions/mean_length": 5758.9140625,
+      "completions/mean_terminated_length": 5758.9140625,
+      "completions/min_length": 1181.0,
+      "completions/min_terminated_length": 1181.0,
+      "entropy": 0.8783154934644699,
+      "epoch": 0.48206071757129715,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005491400603204966,
+      "learning_rate": 1e-05,
+      "loss": 0.0209,
+      "num_tokens": 460944164.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2330428510904312,
+      "rewards/accuracy_reward/mean": 0.5859375,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999998807907104,
+      "sampling/importance_sampling_ratio/min": 0.003907227888703346,
+      "sampling/sampling_logp_difference/max": 5.54492712020874,
+      "sampling/sampling_logp_difference/mean": 0.019315458834171295,
+      "step": 524
+    },
+    {
+      "clip_ratio/high_max": 1.5554858691757545e-05,
+      "clip_ratio/high_mean": 3.888714672939386e-06,
+      "clip_ratio/low_mean": 9.616303373150004e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.3505018273463065e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15536.0,
+      "completions/mean_length": 7573.375,
+      "completions/mean_terminated_length": 7504.0,
+      "completions/min_length": 1579.0,
+      "completions/min_terminated_length": 1579.0,
+      "entropy": 1.057753436267376,
+      "epoch": 0.48298068077276907,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.0038622859865427017,
+      "learning_rate": 1e-05,
+      "loss": 0.0103,
+      "num_tokens": 461931916.0,
+      "reward": 0.3125,
+      "reward_std": 0.14123955368995667,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999769926071167,
+      "sampling/importance_sampling_ratio/min": 0.002133321948349476,
+      "sampling/sampling_logp_difference/max": 6.1500749588012695,
+      "sampling/sampling_logp_difference/mean": 0.02145528793334961,
+      "step": 525
+    },
+    {
+      "clip_ratio/high_max": 2.2185531634022482e-05,
+      "clip_ratio/high_mean": 6.324094329102081e-06,
+      "clip_ratio/low_mean": 4.7102344296945375e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.342643908079481e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14553.0,
+      "completions/mean_length": 7353.0703125,
+      "completions/mean_terminated_length": 7136.328125,
+      "completions/min_length": 907.0,
+      "completions/min_terminated_length": 907.0,
+      "entropy": 0.9386680871248245,
+      "epoch": 0.48390064397424104,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002902502194046974,
+      "learning_rate": 1e-05,
+      "loss": 0.0506,
+      "num_tokens": 462894701.0,
+      "reward": 0.5234375,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999150037765503,
+      "sampling/importance_sampling_ratio/min": 0.00492977537214756,
+      "sampling/sampling_logp_difference/max": 5.312461853027344,
+      "sampling/sampling_logp_difference/mean": 0.021296534687280655,
+      "step": 526
+    },
+    {
+      "clip_ratio/high_max": 1.8664793969946913e-05,
+      "clip_ratio/high_mean": 4.666198492486728e-06,
+      "clip_ratio/low_mean": 5.111583186589996e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.578203035838669e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15851.0,
+      "completions/mean_length": 7280.953125,
+      "completions/mean_terminated_length": 6987.30615234375,
+      "completions/min_length": 1111.0,
+      "completions/min_terminated_length": 1111.0,
+      "entropy": 0.9424067437648773,
+      "epoch": 0.48482060717571296,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002602500608190894,
+      "learning_rate": 1e-05,
+      "loss": 0.0546,
+      "num_tokens": 463849087.0,
+      "reward": 0.3125,
+      "reward_std": 0.2290911078453064,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999302625656128,
+      "sampling/importance_sampling_ratio/min": 4.007156167062931e-05,
+      "sampling/sampling_logp_difference/max": 10.12484359741211,
+      "sampling/sampling_logp_difference/mean": 0.020630592480301857,
+      "step": 527
+    },
+    {
+      "clip_ratio/high_max": 3.77411461158772e-05,
+      "clip_ratio/high_mean": 1.0150766001970624e-05,
+      "clip_ratio/low_mean": 4.5688502041230095e-05,
+      "clip_ratio/low_min": 5.72383623875794e-06,
+      "clip_ratio/region_mean": 5.583926849794807e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14628.0,
+      "completions/max_terminated_length": 14628.0,
+      "completions/mean_length": 6520.6328125,
+      "completions/mean_terminated_length": 6520.6328125,
+      "completions/min_length": 1459.0,
+      "completions/min_terminated_length": 1459.0,
+      "entropy": 0.8501213267445564,
+      "epoch": 0.48574057037718493,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005743890535086393,
+      "learning_rate": 1e-05,
+      "loss": 0.1494,
+      "num_tokens": 464704336.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3413938879966736,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999988079071045,
+      "sampling/importance_sampling_ratio/min": 5.838880315423012e-05,
+      "sampling/sampling_logp_difference/max": 9.74838638305664,
+      "sampling/sampling_logp_difference/mean": 0.018370801582932472,
+      "step": 528
+    },
+    {
+      "clip_ratio/high_max": 9.150254300038796e-06,
+      "clip_ratio/high_mean": 2.287563575009699e-06,
+      "clip_ratio/low_mean": 2.1804387529300584e-05,
+      "clip_ratio/low_min": 3.918126822100021e-06,
+      "clip_ratio/region_mean": 2.4091951559057634e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14675.0,
+      "completions/max_terminated_length": 14675.0,
+      "completions/mean_length": 7111.0,
+      "completions/mean_terminated_length": 7111.0,
+      "completions/min_length": 1288.0,
+      "completions/min_terminated_length": 1288.0,
+      "entropy": 0.8829544633626938,
+      "epoch": 0.48666053357865685,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004826955031603575,
+      "learning_rate": 1e-05,
+      "loss": 0.0967,
+      "num_tokens": 465632152.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2975040376186371,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999524354934692,
+      "sampling/importance_sampling_ratio/min": 0.00011604782775975764,
+      "sampling/sampling_logp_difference/max": 9.061508178710938,
+      "sampling/sampling_logp_difference/mean": 0.019976403564214706,
+      "step": 529
+    },
+    {
+      "clip_ratio/high_max": 2.3185014015325578e-05,
+      "clip_ratio/high_mean": 7.603994390592561e-06,
+      "clip_ratio/low_mean": 4.392900382299558e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.153299889570917e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15132.0,
+      "completions/mean_length": 7797.7109375,
+      "completions/mean_terminated_length": 7448.67431640625,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.9747610911726952,
+      "epoch": 0.48758049678012877,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0028944616205990314,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 466648507.0,
+      "reward": 0.390625,
+      "reward_std": 0.26303553581237793,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999991774559021,
+      "sampling/importance_sampling_ratio/min": 0.0002612585376482457,
+      "sampling/sampling_logp_difference/max": 8.25,
+      "sampling/sampling_logp_difference/mean": 0.020830729976296425,
+      "step": 530
+    },
+    {
+      "clip_ratio/high_max": 1.4947459476388758e-05,
+      "clip_ratio/high_mean": 3.7368648690971895e-06,
+      "clip_ratio/low_mean": 4.282657914700394e-05,
+      "clip_ratio/low_min": 4.545454430626705e-06,
+      "clip_ratio/region_mean": 4.656344435716164e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16160.0,
+      "completions/mean_length": 6395.4765625,
+      "completions/mean_terminated_length": 6316.82666015625,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "entropy": 0.9015842452645302,
+      "epoch": 0.48850045998160074,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003612271510064602,
+      "learning_rate": 1e-05,
+      "loss": 0.0573,
+      "num_tokens": 467487976.0,
+      "reward": 0.4921875,
+      "reward_std": 0.2664504945278168,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998596906661987,
+      "sampling/importance_sampling_ratio/min": 1.209868287332938e-06,
+      "sampling/sampling_logp_difference/max": 13.624999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01959329843521118,
+      "step": 531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 1.8946868863167765e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8946868863167765e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15694.0,
+      "completions/mean_length": 7298.78125,
+      "completions/mean_terminated_length": 7154.57177734375,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.9978953301906586,
+      "epoch": 0.48942042318307266,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002104024635627866,
+      "learning_rate": 1e-05,
+      "loss": 0.0104,
+      "num_tokens": 468445132.0,
+      "reward": 0.2890625,
+      "reward_std": 0.2301519513130188,
+      "rewards/accuracy_reward/mean": 0.2890625,
+      "rewards/accuracy_reward/std": 0.45510825514793396,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999783039093018,
+      "sampling/importance_sampling_ratio/min": 5.157754640094936e-05,
+      "sampling/sampling_logp_difference/max": 9.872424125671387,
+      "sampling/sampling_logp_difference/mean": 0.021517785266041756,
+      "step": 532
+    },
+    {
+      "clip_ratio/high_max": 2.0034196040796814e-05,
+      "clip_ratio/high_mean": 6.441706659643387e-06,
+      "clip_ratio/low_mean": 3.0451521752183908e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.689322829814046e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16003.0,
+      "completions/mean_length": 7021.53125,
+      "completions/mean_terminated_length": 6561.08154296875,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.9539581760764122,
+      "epoch": 0.49034038638454464,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0009346248698420823,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 469360760.0,
+      "reward": 0.375,
+      "reward_std": 0.20069600641727448,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999547004699707,
+      "sampling/importance_sampling_ratio/min": 0.0029978419188410044,
+      "sampling/sampling_logp_difference/max": 5.8098626136779785,
+      "sampling/sampling_logp_difference/mean": 0.020538944751024246,
+      "step": 533
+    },
+    {
+      "clip_ratio/high_max": 7.874939228713629e-06,
+      "clip_ratio/high_mean": 1.968734807178407e-06,
+      "clip_ratio/low_mean": 3.2224923302237585e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.419365827994625e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15370.0,
+      "completions/max_terminated_length": 15370.0,
+      "completions/mean_length": 6988.2109375,
+      "completions/mean_terminated_length": 6988.2109375,
+      "completions/min_length": 1047.0,
+      "completions/min_terminated_length": 1047.0,
+      "entropy": 0.9471191540360451,
+      "epoch": 0.49126034958601655,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002331435214728117,
+      "learning_rate": 1e-05,
+      "loss": 0.0522,
+      "num_tokens": 470274859.0,
+      "reward": 0.3203125,
+      "reward_std": 0.23751860857009888,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000002145767212,
+      "sampling/importance_sampling_ratio/min": 0.0015642779180780053,
+      "sampling/sampling_logp_difference/max": 6.460330963134766,
+      "sampling/sampling_logp_difference/mean": 0.02088295854628086,
+      "step": 534
+    },
+    {
+      "clip_ratio/high_max": 1.2364610256554442e-05,
+      "clip_ratio/high_mean": 3.0911525641386106e-06,
+      "clip_ratio/low_mean": 3.8229277151913266e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.132042954552162e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16212.0,
+      "completions/max_terminated_length": 16212.0,
+      "completions/mean_length": 7557.453125,
+      "completions/mean_terminated_length": 7557.453125,
+      "completions/min_length": 1064.0,
+      "completions/min_terminated_length": 1064.0,
+      "entropy": 0.9897207245230675,
+      "epoch": 0.4921803127874885,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004562230780720711,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 471263997.0,
+      "reward": 0.4765625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
+      "sampling/importance_sampling_ratio/min": 0.0001586318830959499,
+      "sampling/sampling_logp_difference/max": 8.748924255371094,
+      "sampling/sampling_logp_difference/mean": 0.02160259149968624,
+      "step": 535
+    },
+    {
+      "clip_ratio/high_max": 2.6050724500237266e-05,
+      "clip_ratio/high_mean": 7.420082738462952e-06,
+      "clip_ratio/low_mean": 5.8747830053107464e-05,
+      "clip_ratio/low_min": 1.3906133062846493e-05,
+      "clip_ratio/region_mean": 6.616791324631777e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15603.0,
+      "completions/mean_length": 6532.1953125,
+      "completions/mean_terminated_length": 6295.75244140625,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.9109068289399147,
+      "epoch": 0.49310027598896045,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.004525062162429094,
+      "learning_rate": 1e-05,
+      "loss": 0.0219,
+      "num_tokens": 472120622.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3487703502178192,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999650120735168,
+      "sampling/importance_sampling_ratio/min": 1.474883083574241e-05,
+      "sampling/sampling_logp_difference/max": 11.124346733093262,
+      "sampling/sampling_logp_difference/mean": 0.019527796655893326,
+      "step": 536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.90738064766083e-05,
+      "clip_ratio/low_min": 1.0626089533616323e-05,
+      "clip_ratio/region_mean": 3.90738064766083e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15011.0,
+      "completions/mean_length": 5994.40625,
+      "completions/mean_terminated_length": 5912.5986328125,
+      "completions/min_length": 531.0,
+      "completions/min_terminated_length": 531.0,
+      "entropy": 0.9276224821805954,
+      "epoch": 0.49402023919043236,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005058468785136938,
+      "learning_rate": 1e-05,
+      "loss": 0.0165,
+      "num_tokens": 472906346.0,
+      "reward": 0.421875,
+      "reward_std": 0.19044627249240875,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999456405639648,
+      "sampling/importance_sampling_ratio/min": 0.0005196271813474596,
+      "sampling/sampling_logp_difference/max": 7.562398910522461,
+      "sampling/sampling_logp_difference/mean": 0.020568232983350754,
+      "step": 537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.992188062009518e-05,
+      "clip_ratio/low_min": 1.2131874427723233e-05,
+      "clip_ratio/region_mean": 5.992188062009518e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15992.0,
+      "completions/mean_length": 6469.046875,
+      "completions/mean_terminated_length": 6311.6669921875,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "entropy": 0.9536962807178497,
+      "epoch": 0.49494020239190434,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.007286665495485067,
+      "learning_rate": 1e-05,
+      "loss": 0.1282,
+      "num_tokens": 473756256.0,
+      "reward": 0.3515625,
+      "reward_std": 0.35772189497947693,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000038146972656,
+      "sampling/importance_sampling_ratio/min": 6.244324322324246e-05,
+      "sampling/sampling_logp_difference/max": 9.681252479553223,
+      "sampling/sampling_logp_difference/mean": 0.019624462351202965,
+      "step": 538
+    },
+    {
+      "clip_ratio/high_max": 1.0018506145570427e-05,
+      "clip_ratio/high_mean": 2.504626536392607e-06,
+      "clip_ratio/low_mean": 3.329443018174061e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.57990563770727e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15383.0,
+      "completions/max_terminated_length": 15383.0,
+      "completions/mean_length": 5778.703125,
+      "completions/mean_terminated_length": 5778.703125,
+      "completions/min_length": 903.0,
+      "completions/min_terminated_length": 903.0,
+      "entropy": 0.9274095296859741,
+      "epoch": 0.49586016559337626,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0031439310405403376,
+      "learning_rate": 1e-05,
+      "loss": -0.0091,
+      "num_tokens": 474515194.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000576972961426,
+      "sampling/importance_sampling_ratio/min": 0.0006267410353757441,
+      "sampling/sampling_logp_difference/max": 7.374977111816406,
+      "sampling/sampling_logp_difference/mean": 0.019796252250671387,
+      "step": 539
+    },
+    {
+      "clip_ratio/high_max": 3.1761268928676145e-05,
+      "clip_ratio/high_mean": 9.23904565297562e-06,
+      "clip_ratio/low_mean": 4.140612338687788e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.064516949460085e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16146.0,
+      "completions/max_terminated_length": 16146.0,
+      "completions/mean_length": 6400.75,
+      "completions/mean_terminated_length": 6400.75,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 0.8927748426795006,
+      "epoch": 0.49678012879484823,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0039032045751810074,
+      "learning_rate": 1e-05,
+      "loss": 0.0938,
+      "num_tokens": 475355186.0,
+      "reward": 0.5546875,
+      "reward_std": 0.3135277032852173,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999880194664001,
+      "sampling/importance_sampling_ratio/min": 4.19893694925122e-06,
+      "sampling/sampling_logp_difference/max": 12.3806791305542,
+      "sampling/sampling_logp_difference/mean": 0.019878748804330826,
+      "step": 540
+    },
+    {
+      "clip_ratio/high_max": 2.524126966818585e-05,
+      "clip_ratio/high_mean": 7.227385253827379e-06,
+      "clip_ratio/low_mean": 5.609390495919797e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.332129100883321e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14347.0,
+      "completions/mean_length": 7150.234375,
+      "completions/mean_terminated_length": 6928.62451171875,
+      "completions/min_length": 1548.0,
+      "completions/min_terminated_length": 1548.0,
+      "entropy": 0.8632503524422646,
+      "epoch": 0.49770009199632015,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004979084711521864,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 476289752.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3369181156158447,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999991059303284,
+      "sampling/importance_sampling_ratio/min": 0.0004304716712795198,
+      "sampling/sampling_logp_difference/max": 7.75062894821167,
+      "sampling/sampling_logp_difference/mean": 0.019658904522657394,
+      "step": 541
+    },
+    {
+      "clip_ratio/high_max": 2.5298505988757825e-05,
+      "clip_ratio/high_mean": 6.324626497189456e-06,
+      "clip_ratio/low_mean": 3.922748987861269e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.555211648948898e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16353.0,
+      "completions/mean_length": 6855.6640625,
+      "completions/mean_terminated_length": 6704.4208984375,
+      "completions/min_length": 771.0,
+      "completions/min_terminated_length": 771.0,
+      "entropy": 0.8328540697693825,
+      "epoch": 0.49862005519779207,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003560611279681325,
+      "learning_rate": 1e-05,
+      "loss": 0.0332,
+      "num_tokens": 477186885.0,
+      "reward": 0.515625,
+      "reward_std": 0.2743411958217621,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998643398284912,
+      "sampling/importance_sampling_ratio/min": 0.00021035241661593318,
+      "sampling/sampling_logp_difference/max": 8.466726303100586,
+      "sampling/sampling_logp_difference/mean": 0.01880962960422039,
+      "step": 542
+    },
+    {
+      "clip_ratio/high_max": 8.90761498339998e-06,
+      "clip_ratio/high_mean": 2.226903745849995e-06,
+      "clip_ratio/low_mean": 5.487640487444878e-05,
+      "clip_ratio/low_min": 6.345177553157555e-06,
+      "clip_ratio/region_mean": 5.7103308108708006e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15880.0,
+      "completions/mean_length": 7117.1015625,
+      "completions/mean_terminated_length": 6818.1689453125,
+      "completions/min_length": 1067.0,
+      "completions/min_terminated_length": 1067.0,
+      "entropy": 0.9280833601951599,
+      "epoch": 0.49954001839926404,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037869063671678305,
+      "learning_rate": 1e-05,
+      "loss": 0.0773,
+      "num_tokens": 478121506.0,
+      "reward": 0.484375,
+      "reward_std": 0.2919674217700958,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999738931655884,
+      "sampling/importance_sampling_ratio/min": 3.256524507833092e-07,
+      "sampling/sampling_logp_difference/max": 14.937435150146484,
+      "sampling/sampling_logp_difference/mean": 0.0203043594956398,
+      "step": 543
+    },
+    {
+      "clip_ratio/high_max": 1.3482746680892888e-05,
+      "clip_ratio/high_mean": 3.370686670223222e-06,
+      "clip_ratio/low_mean": 3.976425330165512e-05,
+      "clip_ratio/low_min": 4.979286131856497e-06,
+      "clip_ratio/region_mean": 4.313493991503492e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16046.0,
+      "completions/mean_length": 6885.7109375,
+      "completions/mean_terminated_length": 6734.94482421875,
+      "completions/min_length": 1184.0,
+      "completions/min_terminated_length": 1184.0,
+      "entropy": 0.9137701392173767,
+      "epoch": 0.500459981600736,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002787451259791851,
+      "learning_rate": 1e-05,
+      "loss": 0.0847,
+      "num_tokens": 479021365.0,
+      "reward": 0.5,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000042915344238,
+      "sampling/importance_sampling_ratio/min": 0.0013747947523370385,
+      "sampling/sampling_logp_difference/max": 6.589450836181641,
+      "sampling/sampling_logp_difference/mean": 0.02060278132557869,
+      "step": 544
+    },
+    {
+      "clip_ratio/high_max": 2.918380459959735e-05,
+      "clip_ratio/high_mean": 8.077826691987866e-06,
+      "clip_ratio/low_mean": 4.93504342102824e-05,
+      "clip_ratio/low_min": 5.1258921303087845e-06,
+      "clip_ratio/region_mean": 5.742826124333078e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15047.0,
+      "completions/mean_length": 7055.7265625,
+      "completions/mean_terminated_length": 6982.275390625,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "entropy": 1.1009352952241898,
+      "epoch": 0.5013799448022079,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005555091425776482,
+      "learning_rate": 1e-05,
+      "loss": 0.0225,
+      "num_tokens": 479951778.0,
+      "reward": 0.28125,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.28125,
+      "rewards/accuracy_reward/std": 0.4513758420944214,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999507665634155,
+      "sampling/importance_sampling_ratio/min": 2.7657671353154e-07,
+      "sampling/sampling_logp_difference/max": 15.100777626037598,
+      "sampling/sampling_logp_difference/mean": 0.02176634594798088,
+      "step": 545
+    },
+    {
+      "clip_ratio/high_max": 9.75229158939328e-06,
+      "clip_ratio/high_mean": 2.43807289734832e-06,
+      "clip_ratio/low_mean": 3.58120408918694e-05,
+      "clip_ratio/low_min": 5.571651399804978e-06,
+      "clip_ratio/region_mean": 3.825011424396507e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16100.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 6088.2109375,
+      "completions/mean_terminated_length": 6088.2109375,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.7534168809652328,
+      "epoch": 0.5022999080036799,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.00568060576915741,
+      "learning_rate": 1e-05,
+      "loss": 0.1423,
+      "num_tokens": 480749677.0,
+      "reward": 0.6484375,
+      "reward_std": 0.3729842007160187,
+      "rewards/accuracy_reward/mean": 0.6484375,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999527931213379,
+      "sampling/importance_sampling_ratio/min": 0.0002166072663385421,
+      "sampling/sampling_logp_difference/max": 8.437424659729004,
+      "sampling/sampling_logp_difference/mean": 0.017093103379011154,
+      "step": 546
+    },
+    {
+      "clip_ratio/high_max": 1.821310434024781e-05,
+      "clip_ratio/high_mean": 4.5532760850619525e-06,
+      "clip_ratio/low_mean": 2.870424191314669e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.325751754346129e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16029.0,
+      "completions/mean_length": 5638.8515625,
+      "completions/mean_terminated_length": 5380.96826171875,
+      "completions/min_length": 1352.0,
+      "completions/min_terminated_length": 1352.0,
+      "entropy": 0.8868100792169571,
+      "epoch": 0.5032198712051518,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019015485886484385,
+      "learning_rate": 1e-05,
+      "loss": 0.1025,
+      "num_tokens": 481489954.0,
+      "reward": 0.59375,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999911904335022,
+      "sampling/importance_sampling_ratio/min": 0.0001796126161934808,
+      "sampling/sampling_logp_difference/max": 8.62470817565918,
+      "sampling/sampling_logp_difference/mean": 0.019102448597550392,
+      "step": 547
+    },
+    {
+      "clip_ratio/high_max": 2.3414544557454064e-05,
+      "clip_ratio/high_mean": 7.0229532411758555e-06,
+      "clip_ratio/low_mean": 3.169551814607985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8718471842003055e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15258.0,
+      "completions/mean_length": 6776.59375,
+      "completions/mean_terminated_length": 6624.095703125,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.9075161814689636,
+      "epoch": 0.5041398344066237,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.004203350283205509,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 482375358.0,
+      "reward": 0.453125,
+      "reward_std": 0.22567126154899597,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999104738235474,
+      "sampling/importance_sampling_ratio/min": 0.0036098493728786707,
+      "sampling/sampling_logp_difference/max": 5.6320695877075195,
+      "sampling/sampling_logp_difference/mean": 0.019327163696289062,
+      "step": 548
+    },
+    {
+      "clip_ratio/high_max": 1.8746226487564854e-05,
+      "clip_ratio/high_mean": 5.84939061809564e-06,
+      "clip_ratio/low_mean": 3.6077018648938974e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.192640903966094e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15684.0,
+      "completions/mean_length": 7507.59375,
+      "completions/mean_terminated_length": 7071.048828125,
+      "completions/min_length": 774.0,
+      "completions/min_terminated_length": 774.0,
+      "entropy": 0.8015655726194382,
+      "epoch": 0.5050597976080957,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004891456104815006,
+      "learning_rate": 1e-05,
+      "loss": 0.0499,
+      "num_tokens": 483357450.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2772369980812073,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999200701713562,
+      "sampling/importance_sampling_ratio/min": 0.0032753932755440474,
+      "sampling/sampling_logp_difference/max": 5.721317291259766,
+      "sampling/sampling_logp_difference/mean": 0.019086822867393494,
+      "step": 549
+    },
+    {
+      "clip_ratio/high_max": 2.4045971031227964e-05,
+      "clip_ratio/high_mean": 6.011492757806991e-06,
+      "clip_ratio/low_mean": 3.096040018135682e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.697189299600723e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16107.0,
+      "completions/mean_length": 6061.3125,
+      "completions/mean_terminated_length": 5813.568359375,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.8335569724440575,
+      "epoch": 0.5059797608095676,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003564947983250022,
+      "learning_rate": 1e-05,
+      "loss": 0.028,
+      "num_tokens": 484153554.0,
+      "reward": 0.3984375,
+      "reward_std": 0.26143792271614075,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999876022338867,
+      "sampling/importance_sampling_ratio/min": 0.02006213553249836,
+      "sampling/sampling_logp_difference/max": 3.908921003341675,
+      "sampling/sampling_logp_difference/mean": 0.018360145390033722,
+      "step": 550
+    },
+    {
+      "clip_ratio/high_max": 9.095339009945747e-06,
+      "clip_ratio/high_mean": 2.2738347524864366e-06,
+      "clip_ratio/low_mean": 4.612986276697484e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.840369865632965e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15957.0,
+      "completions/mean_length": 7312.4921875,
+      "completions/mean_terminated_length": 7241.06298828125,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.9900097697973251,
+      "epoch": 0.5068997240110396,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0032013265881687403,
+      "learning_rate": 1e-05,
+      "loss": 0.0976,
+      "num_tokens": 485111601.0,
+      "reward": 0.3125,
+      "reward_std": 0.21040895581245422,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999306201934814,
+      "sampling/importance_sampling_ratio/min": 0.006552733480930328,
+      "sampling/sampling_logp_difference/max": 5.0278730392456055,
+      "sampling/sampling_logp_difference/mean": 0.020712960511446,
+      "step": 551
+    },
+    {
+      "clip_ratio/high_max": 1.360053283860907e-05,
+      "clip_ratio/high_mean": 4.2937051603075815e-06,
+      "clip_ratio/low_mean": 4.3424448904261226e-05,
+      "clip_ratio/low_min": 4.718405762105249e-06,
+      "clip_ratio/region_mean": 4.771815429194248e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14797.0,
+      "completions/max_terminated_length": 14797.0,
+      "completions/mean_length": 6571.4453125,
+      "completions/mean_terminated_length": 6571.4453125,
+      "completions/min_length": 951.0,
+      "completions/min_terminated_length": 951.0,
+      "entropy": 0.8801060244441032,
+      "epoch": 0.5078196872125115,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.002972986316308379,
+      "learning_rate": 1e-05,
+      "loss": 0.0888,
+      "num_tokens": 485971554.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998995065689087,
+      "sampling/importance_sampling_ratio/min": 2.4590379325672984e-05,
+      "sampling/sampling_logp_difference/max": 10.613155364990234,
+      "sampling/sampling_logp_difference/mean": 0.020055105909705162,
+      "step": 552
+    },
+    {
+      "clip_ratio/high_max": 8.231255606006016e-06,
+      "clip_ratio/high_mean": 2.057813901501504e-06,
+      "clip_ratio/low_mean": 3.511405452627514e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.71718685983069e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16243.0,
+      "completions/mean_length": 6879.2890625,
+      "completions/mean_terminated_length": 6728.4208984375,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "entropy": 0.8452998399734497,
+      "epoch": 0.5087396504139834,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.00798189826309681,
+      "learning_rate": 1e-05,
+      "loss": 0.0278,
+      "num_tokens": 486873791.0,
+      "reward": 0.4609375,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493956565857,
+      "sampling/importance_sampling_ratio/min": 0.005210345610976219,
+      "sampling/sampling_logp_difference/max": 5.25710916519165,
+      "sampling/sampling_logp_difference/mean": 0.02010834403336048,
+      "step": 553
+    },
+    {
+      "clip_ratio/high_max": 1.757707786964602e-05,
+      "clip_ratio/high_mean": 4.394269467411505e-06,
+      "clip_ratio/low_mean": 6.0756912262149854e-05,
+      "clip_ratio/low_min": 1.0878021839744179e-05,
+      "clip_ratio/region_mean": 6.51511809337535e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16237.0,
+      "completions/max_terminated_length": 16237.0,
+      "completions/mean_length": 7169.8828125,
+      "completions/mean_terminated_length": 7169.8828125,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "entropy": 0.9671438857913017,
+      "epoch": 0.5096596136154554,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0038661460857838392,
+      "learning_rate": 1e-05,
+      "loss": 0.0389,
+      "num_tokens": 487814936.0,
+      "reward": 0.3359375,
+      "reward_std": 0.23751862347126007,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999849796295166,
+      "sampling/importance_sampling_ratio/min": 4.6830271458020434e-05,
+      "sampling/sampling_logp_difference/max": 9.96898078918457,
+      "sampling/sampling_logp_difference/mean": 0.02097059041261673,
+      "step": 554
+    },
+    {
+      "clip_ratio/high_max": 4.649260063160909e-06,
+      "clip_ratio/high_mean": 1.1623150157902273e-06,
+      "clip_ratio/low_mean": 3.180719090778439e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2969506037261453e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15458.0,
+      "completions/mean_length": 6945.0390625,
+      "completions/mean_terminated_length": 6870.71630859375,
+      "completions/min_length": 940.0,
+      "completions/min_terminated_length": 940.0,
+      "entropy": 0.9309702143073082,
+      "epoch": 0.5105795768169273,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002214127918705344,
+      "learning_rate": 1e-05,
+      "loss": 0.0252,
+      "num_tokens": 488720293.0,
+      "reward": 0.375,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914169311523,
+      "sampling/importance_sampling_ratio/min": 0.00032080389792099595,
+      "sampling/sampling_logp_difference/max": 8.04468059539795,
+      "sampling/sampling_logp_difference/mean": 0.01968962326645851,
+      "step": 555
+    },
+    {
+      "clip_ratio/high_max": 1.5428002825501608e-05,
+      "clip_ratio/high_mean": 3.857000706375402e-06,
+      "clip_ratio/low_mean": 5.9988536690980254e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.384553716998198e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16332.0,
+      "completions/mean_length": 5970.1015625,
+      "completions/mean_terminated_length": 5804.8017578125,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "entropy": 0.8274230882525444,
+      "epoch": 0.5114995400183993,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026088031008839607,
+      "learning_rate": 1e-05,
+      "loss": 0.0919,
+      "num_tokens": 489504626.0,
+      "reward": 0.484375,
+      "reward_std": 0.3237725496292114,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999892711639404,
+      "sampling/importance_sampling_ratio/min": 0.00033548183273524046,
+      "sampling/sampling_logp_difference/max": 7.999942779541016,
+      "sampling/sampling_logp_difference/mean": 0.018132124096155167,
+      "step": 556
+    },
+    {
+      "clip_ratio/high_max": 1.628765676287003e-05,
+      "clip_ratio/high_mean": 5.032566036788921e-06,
+      "clip_ratio/low_mean": 3.257978141846252e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.761234722787776e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15636.0,
+      "completions/mean_length": 7099.578125,
+      "completions/mean_terminated_length": 6952.20654296875,
+      "completions/min_length": 567.0,
+      "completions/min_terminated_length": 567.0,
+      "entropy": 0.8690815567970276,
+      "epoch": 0.5124195032198712,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0040014018304646015,
+      "learning_rate": 1e-05,
+      "loss": 0.0021,
+      "num_tokens": 490431156.0,
+      "reward": 0.4609375,
+      "reward_std": 0.25460803508758545,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999368786811829,
+      "sampling/importance_sampling_ratio/min": 0.0007102031959220767,
+      "sampling/sampling_logp_difference/max": 7.249959468841553,
+      "sampling/sampling_logp_difference/mean": 0.02036934345960617,
+      "step": 557
+    },
+    {
+      "clip_ratio/high_max": 1.3314914440343273e-05,
+      "clip_ratio/high_mean": 3.3287286100858182e-06,
+      "clip_ratio/low_mean": 3.747020150512981e-05,
+      "clip_ratio/low_min": 3.852436293527717e-06,
+      "clip_ratio/region_mean": 4.079892983099853e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16301.0,
+      "completions/mean_length": 7253.296875,
+      "completions/mean_terminated_length": 6725.07421875,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8692722395062447,
+      "epoch": 0.5133394664213431,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002252641599625349,
+      "learning_rate": 1e-05,
+      "loss": 0.0711,
+      "num_tokens": 491378450.0,
+      "reward": 0.328125,
+      "reward_std": 0.2488291710615158,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999855756759644,
+      "sampling/importance_sampling_ratio/min": 1.893525586638134e-05,
+      "sampling/sampling_logp_difference/max": 10.87448501586914,
+      "sampling/sampling_logp_difference/mean": 0.01926814392209053,
+      "step": 558
+    },
+    {
+      "clip_ratio/high_max": 3.51339258486405e-05,
+      "clip_ratio/high_mean": 1.0567253070803417e-05,
+      "clip_ratio/low_mean": 3.905345306520758e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.962070602232416e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15865.0,
+      "completions/mean_length": 7827.0234375,
+      "completions/mean_terminated_length": 7406.18798828125,
+      "completions/min_length": 808.0,
+      "completions/min_terminated_length": 808.0,
+      "entropy": 0.9718392416834831,
+      "epoch": 0.5142594296228151,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023995323572307825,
+      "learning_rate": 1e-05,
+      "loss": 0.0684,
+      "num_tokens": 492398757.0,
+      "reward": 0.3359375,
+      "reward_std": 0.26826781034469604,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999961256980896,
+      "sampling/importance_sampling_ratio/min": 0.0003522284678183496,
+      "sampling/sampling_logp_difference/max": 7.951230525970459,
+      "sampling/sampling_logp_difference/mean": 0.020725054666399956,
+      "step": 559
+    },
+    {
+      "clip_ratio/high_max": 9.237001677320222e-06,
+      "clip_ratio/high_mean": 2.3092504193300556e-06,
+      "clip_ratio/low_mean": 4.477454979223694e-05,
+      "clip_ratio/low_min": 3.5987793580716243e-06,
+      "clip_ratio/region_mean": 4.708380049578409e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 14833.0,
+      "completions/max_terminated_length": 14833.0,
+      "completions/mean_length": 6578.53125,
+      "completions/mean_terminated_length": 6578.53125,
+      "completions/min_length": 80.0,
+      "completions/min_terminated_length": 80.0,
+      "entropy": 0.9265799149870872,
+      "epoch": 0.515179392824287,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0053934333845973015,
+      "learning_rate": 1e-05,
+      "loss": 0.0298,
+      "num_tokens": 493259049.0,
+      "reward": 0.4140625,
+      "reward_std": 0.29196250438690186,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999976396560669,
+      "sampling/importance_sampling_ratio/min": 1.5993017541404697e-06,
+      "sampling/sampling_logp_difference/max": 13.345943450927734,
+      "sampling/sampling_logp_difference/mean": 0.019497254863381386,
+      "step": 560
+    },
+    {
+      "clip_ratio/high_max": 6.991247119003674e-06,
+      "clip_ratio/high_mean": 2.580789669082151e-06,
+      "clip_ratio/low_mean": 4.2538599473118666e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.511938891482714e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15783.0,
+      "completions/mean_length": 7893.7734375,
+      "completions/mean_terminated_length": 7826.92138671875,
+      "completions/min_length": 763.0,
+      "completions/min_terminated_length": 763.0,
+      "entropy": 0.9697273746132851,
+      "epoch": 0.516099356025759,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003773769596591592,
+      "learning_rate": 1e-05,
+      "loss": 0.061,
+      "num_tokens": 494288028.0,
+      "reward": 0.296875,
+      "reward_std": 0.29272884130477905,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000444650650024,
+      "sampling/importance_sampling_ratio/min": 4.6216489863581955e-05,
+      "sampling/sampling_logp_difference/max": 9.982173919677734,
+      "sampling/sampling_logp_difference/mean": 0.020743828266859055,
+      "step": 561
+    },
+    {
+      "clip_ratio/high_max": 1.060595786839258e-05,
+      "clip_ratio/high_mean": 4.29665919909894e-06,
+      "clip_ratio/low_mean": 3.2997783137034276e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.729444244982005e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15024.0,
+      "completions/mean_length": 6483.7734375,
+      "completions/mean_terminated_length": 6405.81884765625,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.8293593674898148,
+      "epoch": 0.5170193192272309,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.006334445904940367,
+      "learning_rate": 1e-05,
+      "loss": 0.0217,
+      "num_tokens": 495135903.0,
+      "reward": 0.5,
+      "reward_std": 0.20251333713531494,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999064207077026,
+      "sampling/importance_sampling_ratio/min": 0.0001236602693097666,
+      "sampling/sampling_logp_difference/max": 8.99797248840332,
+      "sampling/sampling_logp_difference/mean": 0.018669776618480682,
+      "step": 562
+    },
+    {
+      "clip_ratio/high_max": 9.357276894661481e-06,
+      "clip_ratio/high_mean": 2.3393192236653704e-06,
+      "clip_ratio/low_mean": 4.667806888392079e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.901738748230855e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16230.0,
+      "completions/mean_length": 6484.546875,
+      "completions/mean_terminated_length": 6246.96044921875,
+      "completions/min_length": 630.0,
+      "completions/min_terminated_length": 630.0,
+      "entropy": 0.7686850279569626,
+      "epoch": 0.5179392824287029,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.003286323742941022,
+      "learning_rate": 1e-05,
+      "loss": 0.0865,
+      "num_tokens": 495986277.0,
+      "reward": 0.59375,
+      "reward_std": 0.3763991594314575,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999945342540741,
+      "sampling/importance_sampling_ratio/min": 2.0216441043885425e-05,
+      "sampling/sampling_logp_difference/max": 10.809014320373535,
+      "sampling/sampling_logp_difference/mean": 0.018656805157661438,
+      "step": 563
+    },
+    {
+      "clip_ratio/high_max": 3.368905208844808e-05,
+      "clip_ratio/high_mean": 9.76577109668142e-06,
+      "clip_ratio/low_mean": 8.26880966542376e-06,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8034580989478854e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15720.0,
+      "completions/mean_length": 6411.3203125,
+      "completions/mean_terminated_length": 5746.47509765625,
+      "completions/min_length": 952.0,
+      "completions/min_terminated_length": 952.0,
+      "entropy": 0.899998240172863,
+      "epoch": 0.5188592456301748,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005072349216789007,
+      "learning_rate": 1e-05,
+      "loss": -0.0049,
+      "num_tokens": 496826094.0,
+      "reward": 0.515625,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999135732650757,
+      "sampling/importance_sampling_ratio/min": 0.0038024066016077995,
+      "sampling/sampling_logp_difference/max": 5.5721211433410645,
+      "sampling/sampling_logp_difference/mean": 0.019648944959044456,
+      "step": 564
+    },
+    {
+      "clip_ratio/high_max": 1.726673963275971e-05,
+      "clip_ratio/high_mean": 6.2551004020861e-06,
+      "clip_ratio/low_mean": 4.834715275592316e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.4602252930635586e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16300.0,
+      "completions/mean_length": 7110.0546875,
+      "completions/mean_terminated_length": 6810.89501953125,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 1.0061073675751686,
+      "epoch": 0.5197792088316467,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.005030680447816849,
+      "learning_rate": 1e-05,
+      "loss": 0.0871,
+      "num_tokens": 497756469.0,
+      "reward": 0.375,
+      "reward_std": 0.3253750801086426,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999985933303833,
+      "sampling/importance_sampling_ratio/min": 0.0004307488852646202,
+      "sampling/sampling_logp_difference/max": 7.749985218048096,
+      "sampling/sampling_logp_difference/mean": 0.02187274768948555,
+      "step": 565
+    },
+    {
+      "clip_ratio/high_max": 3.3920382520591374e-06,
+      "clip_ratio/high_mean": 8.480095630147844e-07,
+      "clip_ratio/low_mean": 2.627351494766117e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.712152416961544e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16100.0,
+      "completions/mean_length": 7546.484375,
+      "completions/mean_terminated_length": 7261.40283203125,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "entropy": 0.898541085422039,
+      "epoch": 0.5206991720331187,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002894402015954256,
+      "learning_rate": 1e-05,
+      "loss": -0.0016,
+      "num_tokens": 498743411.0,
+      "reward": 0.25,
+      "reward_std": 0.2380426526069641,
+      "rewards/accuracy_reward/mean": 0.25,
+      "rewards/accuracy_reward/std": 0.434714138507843,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998988509178162,
+      "sampling/importance_sampling_ratio/min": 3.340166585985571e-05,
+      "sampling/sampling_logp_difference/max": 10.306904792785645,
+      "sampling/sampling_logp_difference/mean": 0.019597206264734268,
+      "step": 566
+    },
+    {
+      "clip_ratio/high_max": 3.407480107853189e-06,
+      "clip_ratio/high_mean": 8.518700269632973e-07,
+      "clip_ratio/low_mean": 1.9815101950371172e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.066697197733447e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15426.0,
+      "completions/mean_length": 6637.9296875,
+      "completions/mean_terminated_length": 6241.74755859375,
+      "completions/min_length": 340.0,
+      "completions/min_terminated_length": 340.0,
+      "entropy": 0.9469815120100975,
+      "epoch": 0.5216191352345906,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0033100086729973555,
+      "learning_rate": 1e-05,
+      "loss": 0.0352,
+      "num_tokens": 499612490.0,
+      "reward": 0.4375,
+      "reward_std": 0.2782978415489197,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999792575836182,
+      "sampling/importance_sampling_ratio/min": 0.000214192972634919,
+      "sampling/sampling_logp_difference/max": 8.448633193969727,
+      "sampling/sampling_logp_difference/mean": 0.019627269357442856,
+      "step": 567
+    },
+    {
+      "clip_ratio/high_max": 2.8962323767700582e-05,
+      "clip_ratio/high_mean": 7.2405809419251455e-06,
+      "clip_ratio/low_mean": 6.551078422489809e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.275136522366665e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15136.0,
+      "completions/mean_length": 6903.0859375,
+      "completions/mean_terminated_length": 6752.595703125,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.976447619497776,
+      "epoch": 0.5225390984360626,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.006571728736162186,
+      "learning_rate": 1e-05,
+      "loss": 0.0543,
+      "num_tokens": 500515117.0,
+      "reward": 0.40625,
+      "reward_std": 0.3335031569004059,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999945163726807,
+      "sampling/importance_sampling_ratio/min": 0.016446342691779137,
+      "sampling/sampling_logp_difference/max": 4.107652187347412,
+      "sampling/sampling_logp_difference/mean": 0.020653847604990005,
+      "step": 568
+    },
+    {
+      "clip_ratio/high_max": 1.4576415196643211e-05,
+      "clip_ratio/high_mean": 3.6441037991608027e-06,
+      "clip_ratio/low_mean": 7.513643731726916e-05,
+      "clip_ratio/low_min": 2.2551557776750997e-05,
+      "clip_ratio/region_mean": 7.878054020693526e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15556.0,
+      "completions/mean_length": 6953.8359375,
+      "completions/mean_terminated_length": 6570.49560546875,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "entropy": 0.8397975340485573,
+      "epoch": 0.5234590616375345,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.007468517404049635,
+      "learning_rate": 1e-05,
+      "loss": 0.0618,
+      "num_tokens": 501427056.0,
+      "reward": 0.421875,
+      "reward_std": 0.3571978807449341,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000053644180298,
+      "sampling/importance_sampling_ratio/min": 0.0001911464933073148,
+      "sampling/sampling_logp_difference/max": 8.562470436096191,
+      "sampling/sampling_logp_difference/mean": 0.01937997341156006,
+      "step": 569
+    },
+    {
+      "clip_ratio/high_max": 3.168922489749093e-05,
+      "clip_ratio/high_mean": 7.922306224372733e-06,
+      "clip_ratio/low_mean": 3.7468206755875144e-05,
+      "clip_ratio/low_min": 5.264044375508092e-06,
+      "clip_ratio/region_mean": 4.5390514060272835e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15961.0,
+      "completions/mean_length": 7807.09375,
+      "completions/mean_terminated_length": 7458.43896484375,
+      "completions/min_length": 562.0,
+      "completions/min_terminated_length": 562.0,
+      "entropy": 0.7974586114287376,
+      "epoch": 0.5243790248390064,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004324767272919416,
+      "learning_rate": 1e-05,
+      "loss": 0.0431,
+      "num_tokens": 502445156.0,
+      "reward": 0.265625,
+      "reward_std": 0.3329663574695587,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999243021011353,
+      "sampling/importance_sampling_ratio/min": 2.9874459869461134e-05,
+      "sampling/sampling_logp_difference/max": 10.418506622314453,
+      "sampling/sampling_logp_difference/mean": 0.018592730164527893,
+      "step": 570
+    },
+    {
+      "clip_ratio/high_max": 1.8414293663227e-05,
+      "clip_ratio/high_mean": 5.567038670051261e-06,
+      "clip_ratio/low_mean": 3.436269958001503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9929738250066293e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16133.0,
+      "completions/mean_length": 6467.890625,
+      "completions/mean_terminated_length": 6310.4921875,
+      "completions/min_length": 874.0,
+      "completions/min_terminated_length": 874.0,
+      "entropy": 0.8665193468332291,
+      "epoch": 0.5252989880404784,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0044867550022900105,
+      "learning_rate": 1e-05,
+      "loss": 0.0434,
+      "num_tokens": 503293398.0,
+      "reward": 0.4609375,
+      "reward_std": 0.2998581528663635,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999916136264801,
+      "sampling/importance_sampling_ratio/min": 0.024881144985556602,
+      "sampling/sampling_logp_difference/max": 3.6936450004577637,
+      "sampling/sampling_logp_difference/mean": 0.019022464752197266,
+      "step": 571
+    },
+    {
+      "clip_ratio/high_max": 1.4845849818811985e-05,
+      "clip_ratio/high_mean": 3.711462454702996e-06,
+      "clip_ratio/low_mean": 3.597185968828853e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.968332202930469e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16309.0,
+      "completions/mean_length": 6275.796875,
+      "completions/mean_terminated_length": 6115.349609375,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 0.8425783589482307,
+      "epoch": 0.5262189512419503,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0033805551938712597,
+      "learning_rate": 1e-05,
+      "loss": 0.0041,
+      "num_tokens": 504115692.0,
+      "reward": 0.3984375,
+      "reward_std": 0.2569621503353119,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000152587890625,
+      "sampling/importance_sampling_ratio/min": 0.018389537930488586,
+      "sampling/sampling_logp_difference/max": 3.9959733486175537,
+      "sampling/sampling_logp_difference/mean": 0.018935590982437134,
+      "step": 572
+    },
+    {
+      "clip_ratio/high_max": 4.3129479763592826e-05,
+      "clip_ratio/high_mean": 1.3471904480866215e-05,
+      "clip_ratio/low_mean": 1.670091853611666e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0172822903296037e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16116.0,
+      "completions/mean_length": 5396.7890625,
+      "completions/mean_terminated_length": 5222.38916015625,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.8558806329965591,
+      "epoch": 0.5271389144434223,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00652205478399992,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 504826577.0,
+      "reward": 0.546875,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999775886535645,
+      "sampling/importance_sampling_ratio/min": 0.0017056812066584826,
+      "sampling/sampling_logp_difference/max": 6.373790740966797,
+      "sampling/sampling_logp_difference/mean": 0.018737314268946648,
+      "step": 573
+    },
+    {
+      "clip_ratio/high_max": 6.914692676218692e-06,
+      "clip_ratio/high_mean": 1.728673169054673e-06,
+      "clip_ratio/low_mean": 2.3435458388121333e-05,
+      "clip_ratio/low_min": 3.954319709009724e-06,
+      "clip_ratio/region_mean": 2.5164132239297032e-05,
+      "completions/clipped_ratio": 0.0859375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16298.0,
+      "completions/mean_length": 7798.9765625,
+      "completions/mean_terminated_length": 6991.837890625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 0.8846152648329735,
+      "epoch": 0.5280588776448942,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0018958896398544312,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 505846438.0,
+      "reward": 0.328125,
+      "reward_std": 0.21253062784671783,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999515414237976,
+      "sampling/importance_sampling_ratio/min": 2.434831731079612e-05,
+      "sampling/sampling_logp_difference/max": 10.623047828674316,
+      "sampling/sampling_logp_difference/mean": 0.019361287355422974,
+      "step": 574
+    },
+    {
+      "clip_ratio/high_max": 1.085428675651201e-05,
+      "clip_ratio/high_mean": 5.064732249593362e-06,
+      "clip_ratio/low_mean": 5.590463968019321e-05,
+      "clip_ratio/low_min": 4.822531082027126e-06,
+      "clip_ratio/region_mean": 6.096937283928128e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16280.0,
+      "completions/mean_length": 6272.5546875,
+      "completions/mean_terminated_length": 6029.88037109375,
+      "completions/min_length": 901.0,
+      "completions/min_terminated_length": 901.0,
+      "entropy": 0.9714803844690323,
+      "epoch": 0.5289788408463661,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003035407979041338,
+      "learning_rate": 1e-05,
+      "loss": 0.1295,
+      "num_tokens": 506670477.0,
+      "reward": 0.3984375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999212026596069,
+      "sampling/importance_sampling_ratio/min": 0.0012103202752768993,
+      "sampling/sampling_logp_difference/max": 6.716870307922363,
+      "sampling/sampling_logp_difference/mean": 0.019988738000392914,
+      "step": 575
+    },
+    {
+      "clip_ratio/high_max": 2.1176599602767965e-05,
+      "clip_ratio/high_mean": 5.294149900691991e-06,
+      "clip_ratio/low_mean": 4.479086726405512e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.008501784686814e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16115.0,
+      "completions/mean_length": 6060.75,
+      "completions/mean_terminated_length": 5896.88916015625,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "entropy": 0.8791732639074326,
+      "epoch": 0.5298988040478381,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.005080445669591427,
+      "learning_rate": 1e-05,
+      "loss": 0.06,
+      "num_tokens": 507471717.0,
+      "reward": 0.421875,
+      "reward_std": 0.3135228157043457,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999859929084778,
+      "sampling/importance_sampling_ratio/min": 0.0025768836494535208,
+      "sampling/sampling_logp_difference/max": 5.961174488067627,
+      "sampling/sampling_logp_difference/mean": 0.019146449863910675,
+      "step": 576
+    },
+    {
+      "clip_ratio/high_max": 1.591328441463702e-05,
+      "clip_ratio/high_mean": 3.978321103659255e-06,
+      "clip_ratio/low_mean": 3.991827338722942e-05,
+      "clip_ratio/low_min": 4.394445568323135e-06,
+      "clip_ratio/region_mean": 4.389659511616628e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16319.0,
+      "completions/mean_length": 7221.65625,
+      "completions/mean_terminated_length": 7149.51171875,
+      "completions/min_length": 1071.0,
+      "completions/min_terminated_length": 1071.0,
+      "entropy": 0.9068904295563698,
+      "epoch": 0.53081876724931,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002491918858140707,
+      "learning_rate": 1e-05,
+      "loss": 0.0263,
+      "num_tokens": 508420417.0,
+      "reward": 0.3046875,
+      "reward_std": 0.22908622026443481,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999144077301025,
+      "sampling/importance_sampling_ratio/min": 0.0010015364969149232,
+      "sampling/sampling_logp_difference/max": 6.906219959259033,
+      "sampling/sampling_logp_difference/mean": 0.019857721403241158,
+      "step": 577
+    },
+    {
+      "clip_ratio/high_max": 2.723786337810452e-06,
+      "clip_ratio/high_mean": 6.80946584452613e-07,
+      "clip_ratio/low_mean": 4.729307283923845e-05,
+      "clip_ratio/low_min": 3.3817600524344016e-06,
+      "clip_ratio/region_mean": 4.7974018798413454e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16090.0,
+      "completions/mean_length": 7279.765625,
+      "completions/mean_terminated_length": 6909.67431640625,
+      "completions/min_length": 754.0,
+      "completions/min_terminated_length": 754.0,
+      "entropy": 0.7393763959407806,
+      "epoch": 0.531738730450782,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.0038857783656567335,
+      "learning_rate": 1e-05,
+      "loss": 0.1167,
+      "num_tokens": 509367579.0,
+      "reward": 0.5703125,
+      "reward_std": 0.3782213628292084,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999372959136963,
+      "sampling/importance_sampling_ratio/min": 8.482332486892119e-05,
+      "sampling/sampling_logp_difference/max": 9.374939918518066,
+      "sampling/sampling_logp_difference/mean": 0.01783195324242115,
+      "step": 578
+    },
+    {
+      "clip_ratio/high_max": 2.4269288587674964e-05,
+      "clip_ratio/high_mean": 6.067322146918741e-06,
+      "clip_ratio/low_mean": 5.770765028501046e-05,
+      "clip_ratio/low_min": 6.032236342434771e-06,
+      "clip_ratio/region_mean": 6.377497174980817e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15946.0,
+      "completions/max_terminated_length": 15946.0,
+      "completions/mean_length": 5381.4375,
+      "completions/mean_terminated_length": 5381.4375,
+      "completions/min_length": 1030.0,
+      "completions/min_terminated_length": 1030.0,
+      "entropy": 0.8337196409702301,
+      "epoch": 0.5326586936522539,
+      "frac_reward_zero_std": 0.0625,
+      "grad_norm": 0.004505726508796215,
+      "learning_rate": 1e-05,
+      "loss": 0.1534,
+      "num_tokens": 510076403.0,
+      "reward": 0.484375,
+      "reward_std": 0.3861297369003296,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999825358390808,
+      "sampling/importance_sampling_ratio/min": 0.0021874941885471344,
+      "sampling/sampling_logp_difference/max": 6.124998569488525,
+      "sampling/sampling_logp_difference/mean": 0.019285976886749268,
+      "step": 579
+    },
+    {
+      "clip_ratio/high_max": 1.83111833393923e-05,
+      "clip_ratio/high_mean": 4.577795834848075e-06,
+      "clip_ratio/low_mean": 4.1738339632502175e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.631613546735025e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15789.0,
+      "completions/mean_length": 8440.7109375,
+      "completions/mean_terminated_length": 8250.072265625,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "entropy": 0.8920768201351166,
+      "epoch": 0.5335786568537259,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0039497604593634605,
+      "learning_rate": 1e-05,
+      "loss": 0.0288,
+      "num_tokens": 511177974.0,
+      "reward": 0.1875,
+      "reward_std": 0.18990950286388397,
+      "rewards/accuracy_reward/mean": 0.1875,
+      "rewards/accuracy_reward/std": 0.39184603095054626,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910831451416,
+      "sampling/importance_sampling_ratio/min": 0.00021938055579084903,
+      "sampling/sampling_logp_difference/max": 8.424702644348145,
+      "sampling/sampling_logp_difference/mean": 0.020451124757528305,
+      "step": 580
+    },
+    {
+      "clip_ratio/high_max": 1.371111534353986e-05,
+      "clip_ratio/high_mean": 3.427778835884965e-06,
+      "clip_ratio/low_mean": 4.171912905803765e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.514690772339236e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16077.0,
+      "completions/mean_length": 6702.3828125,
+      "completions/mean_terminated_length": 6470.0244140625,
+      "completions/min_length": 1169.0,
+      "completions/min_terminated_length": 1169.0,
+      "entropy": 0.8600481152534485,
+      "epoch": 0.5344986200551978,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0024386425502598286,
+      "learning_rate": 1e-05,
+      "loss": 0.0866,
+      "num_tokens": 512054655.0,
+      "reward": 0.5703125,
+      "reward_std": 0.26645052433013916,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000202655792236,
+      "sampling/importance_sampling_ratio/min": 0.0015237311599776149,
+      "sampling/sampling_logp_difference/max": 6.486593246459961,
+      "sampling/sampling_logp_difference/mean": 0.018986206501722336,
+      "step": 581
+    },
+    {
+      "clip_ratio/high_max": 9.279537152906414e-06,
+      "clip_ratio/high_mean": 4.2680171645770315e-06,
+      "clip_ratio/low_mean": 2.6773893978315755e-05,
+      "clip_ratio/low_min": 4.736104074254399e-06,
+      "clip_ratio/region_mean": 3.1041911142892786e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13410.0,
+      "completions/mean_length": 4845.953125,
+      "completions/mean_terminated_length": 4755.1025390625,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "entropy": 0.9067303538322449,
+      "epoch": 0.5354185832566697,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0072782449424266815,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 512696537.0,
+      "reward": 0.4296875,
+      "reward_std": 0.29036980867385864,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999409317970276,
+      "sampling/importance_sampling_ratio/min": 0.017822081223130226,
+      "sampling/sampling_logp_difference/max": 4.027317047119141,
+      "sampling/sampling_logp_difference/mean": 0.01862735114991665,
+      "step": 582
+    },
+    {
+      "clip_ratio/high_max": 8.41807559481822e-06,
+      "clip_ratio/high_mean": 2.104518898704555e-06,
+      "clip_ratio/low_mean": 4.360654588708712e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.5711064331044327e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16282.0,
+      "completions/mean_length": 6173.171875,
+      "completions/mean_terminated_length": 6011.095703125,
+      "completions/min_length": 756.0,
+      "completions/min_terminated_length": 756.0,
+      "entropy": 0.9604142308235168,
+      "epoch": 0.5363385464581417,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005057654343545437,
+      "learning_rate": 1e-05,
+      "loss": 0.0799,
+      "num_tokens": 513505135.0,
+      "reward": 0.4375,
+      "reward_std": 0.2767051160335541,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999635219573975,
+      "sampling/importance_sampling_ratio/min": 0.0002380619989708066,
+      "sampling/sampling_logp_difference/max": 8.342979431152344,
+      "sampling/sampling_logp_difference/mean": 0.020879898220300674,
+      "step": 583
+    },
+    {
+      "clip_ratio/high_max": 7.327939783863258e-06,
+      "clip_ratio/high_mean": 3.227510205761064e-06,
+      "clip_ratio/low_mean": 4.2579683963595016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.580719428304292e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15173.0,
+      "completions/mean_length": 5546.5234375,
+      "completions/mean_terminated_length": 5374.50048828125,
+      "completions/min_length": 1113.0,
+      "completions/min_terminated_length": 1113.0,
+      "entropy": 0.8015405982732773,
+      "epoch": 0.5372585096596136,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0047672707587480545,
+      "learning_rate": 1e-05,
+      "loss": 0.0991,
+      "num_tokens": 514232058.0,
+      "reward": 0.4921875,
+      "reward_std": 0.27038949728012085,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999624490737915,
+      "sampling/importance_sampling_ratio/min": 5.8323133998783305e-05,
+      "sampling/sampling_logp_difference/max": 9.74951171875,
+      "sampling/sampling_logp_difference/mean": 0.018185433000326157,
+      "step": 584
+    },
+    {
+      "clip_ratio/high_max": 1.3804907666781219e-05,
+      "clip_ratio/high_mean": 4.388961428958282e-06,
+      "clip_ratio/low_mean": 5.04182496570138e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.480721097228525e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15778.0,
+      "completions/mean_length": 6637.359375,
+      "completions/mean_terminated_length": 6482.6513671875,
+      "completions/min_length": 1144.0,
+      "completions/min_terminated_length": 1144.0,
+      "entropy": 1.0173144191503525,
+      "epoch": 0.5381784728610856,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005850035231560469,
+      "learning_rate": 1e-05,
+      "loss": 0.0453,
+      "num_tokens": 515103184.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24988999962806702,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999963104724884,
+      "sampling/importance_sampling_ratio/min": 1.4479226706498594e-07,
+      "sampling/sampling_logp_difference/max": 15.747965812683105,
+      "sampling/sampling_logp_difference/mean": 0.020641878247261047,
+      "step": 585
+    },
+    {
+      "clip_ratio/high_max": 1.594428704265738e-05,
+      "clip_ratio/high_mean": 3.986071760664345e-06,
+      "clip_ratio/low_mean": 5.566071547491447e-05,
+      "clip_ratio/low_min": 8.978264304460026e-06,
+      "clip_ratio/region_mean": 5.964678746295249e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15716.0,
+      "completions/mean_length": 6940.6171875,
+      "completions/mean_terminated_length": 6866.259765625,
+      "completions/min_length": 1273.0,
+      "completions/min_terminated_length": 1273.0,
+      "entropy": 0.8547529205679893,
+      "epoch": 0.5390984360625575,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0037875184789299965,
+      "learning_rate": 1e-05,
+      "loss": 0.0831,
+      "num_tokens": 516009791.0,
+      "reward": 0.4765625,
+      "reward_std": 0.27222442626953125,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999997615814209,
+      "sampling/importance_sampling_ratio/min": 5.772008080384694e-06,
+      "sampling/sampling_logp_difference/max": 12.062490463256836,
+      "sampling/sampling_logp_difference/mean": 0.018527517095208168,
+      "step": 586
+    },
+    {
+      "clip_ratio/high_max": 6.924382887518732e-06,
+      "clip_ratio/high_mean": 1.731095721879683e-06,
+      "clip_ratio/low_mean": 3.340147941344185e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.5132575476382044e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15387.0,
+      "completions/mean_length": 6837.125,
+      "completions/mean_terminated_length": 6761.95263671875,
+      "completions/min_length": 1319.0,
+      "completions/min_terminated_length": 1319.0,
+      "entropy": 0.9027494043111801,
+      "epoch": 0.5400183992640294,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0015506440540775657,
+      "learning_rate": 1e-05,
+      "loss": 0.0502,
+      "num_tokens": 516903335.0,
+      "reward": 0.296875,
+      "reward_std": 0.20593318343162537,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999232292175293,
+      "sampling/importance_sampling_ratio/min": 4.2636147554730996e-05,
+      "sampling/sampling_logp_difference/max": 10.0628080368042,
+      "sampling/sampling_logp_difference/mean": 0.020130250602960587,
+      "step": 587
+    },
+    {
+      "clip_ratio/high_max": 1.2774215747413109e-05,
+      "clip_ratio/high_mean": 3.1935539368532773e-06,
+      "clip_ratio/low_mean": 3.885528553837503e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.204883930469805e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7866.703125,
+      "completions/mean_terminated_length": 7222.5380859375,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "entropy": 0.8133657574653625,
+      "epoch": 0.5409383624655014,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003520917845889926,
+      "learning_rate": 1e-05,
+      "loss": 0.1165,
+      "num_tokens": 517929081.0,
+      "reward": 0.4453125,
+      "reward_std": 0.3316730856895447,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999421834945679,
+      "sampling/importance_sampling_ratio/min": 6.223546370165423e-05,
+      "sampling/sampling_logp_difference/max": 9.684585571289062,
+      "sampling/sampling_logp_difference/mean": 0.01890747994184494,
+      "step": 588
+    },
+    {
+      "clip_ratio/high_max": 6.942207619431429e-06,
+      "clip_ratio/high_mean": 1.7355519048578572e-06,
+      "clip_ratio/low_mean": 3.457626269209868e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.631181459695654e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15944.0,
+      "completions/mean_length": 6701.296875,
+      "completions/mean_terminated_length": 6547.603515625,
+      "completions/min_length": 24.0,
+      "completions/min_terminated_length": 24.0,
+      "entropy": 0.9360691756010056,
+      "epoch": 0.5418583256669733,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0029796145390719175,
+      "learning_rate": 1e-05,
+      "loss": 0.0447,
+      "num_tokens": 518810247.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2869499921798706,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999505877494812,
+      "sampling/importance_sampling_ratio/min": 2.520391673144218e-10,
+      "sampling/sampling_logp_difference/max": 22.101436614990234,
+      "sampling/sampling_logp_difference/mean": 0.01977725327014923,
+      "step": 589
+    },
+    {
+      "clip_ratio/high_max": 3.7906356737948954e-06,
+      "clip_ratio/high_mean": 9.476589184487239e-07,
+      "clip_ratio/low_mean": 3.738725240509666e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8334911323545384e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15971.0,
+      "completions/mean_length": 7029.453125,
+      "completions/mean_terminated_length": 6804.9443359375,
+      "completions/min_length": 1180.0,
+      "completions/min_terminated_length": 1180.0,
+      "entropy": 0.9168537557125092,
+      "epoch": 0.5427782888684453,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0024249793495982885,
+      "learning_rate": 1e-05,
+      "loss": 0.0477,
+      "num_tokens": 519730577.0,
+      "reward": 0.390625,
+      "reward_std": 0.22803518176078796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999786615371704,
+      "sampling/importance_sampling_ratio/min": 1.6278204384434503e-07,
+      "sampling/sampling_logp_difference/max": 15.630853652954102,
+      "sampling/sampling_logp_difference/mean": 0.01923082396388054,
+      "step": 590
+    },
+    {
+      "clip_ratio/high_max": 2.4759768621152034e-05,
+      "clip_ratio/high_mean": 6.1899421552880085e-06,
+      "clip_ratio/low_mean": 3.2254738812298456e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.8444680967586464e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15600.0,
+      "completions/mean_length": 7255.453125,
+      "completions/mean_terminated_length": 6646.8837890625,
+      "completions/min_length": 832.0,
+      "completions/min_terminated_length": 832.0,
+      "entropy": 0.8241118341684341,
+      "epoch": 0.5436982520699172,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003160425927489996,
+      "learning_rate": 1e-05,
+      "loss": 0.0821,
+      "num_tokens": 520680707.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2461756467819214,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000334978103638,
+      "sampling/importance_sampling_ratio/min": 0.0009408618789166212,
+      "sampling/sampling_logp_difference/max": 6.968714237213135,
+      "sampling/sampling_logp_difference/mean": 0.019255205988883972,
+      "step": 591
+    },
+    {
+      "clip_ratio/high_max": 7.459808557541692e-06,
+      "clip_ratio/high_mean": 1.864952139385423e-06,
+      "clip_ratio/low_mean": 3.9836502310208743e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.170145416537707e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 7819.96875,
+      "completions/mean_terminated_length": 7752.53564453125,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 512.0,
+      "entropy": 1.1218742430210114,
+      "epoch": 0.5446182152713891,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.00411194609478116,
+      "learning_rate": 1e-05,
+      "loss": 0.0267,
+      "num_tokens": 521703303.0,
+      "reward": 0.2265625,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.2265625,
+      "rewards/accuracy_reward/std": 0.4202519655227661,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999041557312012,
+      "sampling/importance_sampling_ratio/min": 0.0003571478300727904,
+      "sampling/sampling_logp_difference/max": 7.937360763549805,
+      "sampling/sampling_logp_difference/mean": 0.022727783769369125,
+      "step": 592
+    },
+    {
+      "clip_ratio/high_max": 1.8858649582398357e-05,
+      "clip_ratio/high_mean": 4.714662395599589e-06,
+      "clip_ratio/low_mean": 3.738353416338214e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.2098196558981726e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16117.0,
+      "completions/mean_length": 6322.8671875,
+      "completions/mean_terminated_length": 6163.1669921875,
+      "completions/min_length": 637.0,
+      "completions/min_terminated_length": 637.0,
+      "entropy": 0.8323960080742836,
+      "epoch": 0.5455381784728611,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022753921803086996,
+      "learning_rate": 1e-05,
+      "loss": 0.0339,
+      "num_tokens": 522531422.0,
+      "reward": 0.4140625,
+      "reward_std": 0.20753081142902374,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998952150344849,
+      "sampling/importance_sampling_ratio/min": 5.422274170996388e-06,
+      "sampling/sampling_logp_difference/max": 12.124995231628418,
+      "sampling/sampling_logp_difference/mean": 0.01893780007958412,
+      "step": 593
+    },
+    {
+      "clip_ratio/high_max": 3.977598225901602e-06,
+      "clip_ratio/high_mean": 9.943995564754005e-07,
+      "clip_ratio/low_mean": 1.1187657776190463e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.2182057332665863e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16055.0,
+      "completions/mean_length": 7054.0625,
+      "completions/mean_terminated_length": 6905.96875,
+      "completions/min_length": 101.0,
+      "completions/min_terminated_length": 101.0,
+      "entropy": 0.866028867661953,
+      "epoch": 0.546458141674333,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.004338000901043415,
+      "learning_rate": 1e-05,
+      "loss": -0.0134,
+      "num_tokens": 523453262.0,
+      "reward": 0.328125,
+      "reward_std": 0.13204573094844818,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998721480369568,
+      "sampling/importance_sampling_ratio/min": 7.97068714746274e-05,
+      "sampling/sampling_logp_difference/max": 9.437154769897461,
+      "sampling/sampling_logp_difference/mean": 0.01982954889535904,
+      "step": 594
+    },
+    {
+      "clip_ratio/high_max": 1.5038514220577781e-05,
+      "clip_ratio/high_mean": 3.7596285551444453e-06,
+      "clip_ratio/low_mean": 3.533169467573316e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.9091323742468376e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16361.0,
+      "completions/mean_length": 7539.0703125,
+      "completions/mean_terminated_length": 7027.3798828125,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "entropy": 0.8601142391562462,
+      "epoch": 0.547378104875805,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003401415189728141,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 524436831.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2511882185935974,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999969482421875,
+      "sampling/importance_sampling_ratio/min": 2.0915547793265432e-05,
+      "sampling/sampling_logp_difference/max": 10.775017738342285,
+      "sampling/sampling_logp_difference/mean": 0.019884679466485977,
+      "step": 595
+    },
+    {
+      "clip_ratio/high_max": 2.9679867111553904e-05,
+      "clip_ratio/high_mean": 8.187421713046206e-06,
+      "clip_ratio/low_mean": 5.44505830930575e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.263800514716422e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16343.0,
+      "completions/mean_length": 7137.96875,
+      "completions/mean_terminated_length": 6762.11376953125,
+      "completions/min_length": 606.0,
+      "completions/min_terminated_length": 606.0,
+      "entropy": 0.7909424379467964,
+      "epoch": 0.5482980680772769,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002879115054383874,
+      "learning_rate": 1e-05,
+      "loss": 0.0549,
+      "num_tokens": 525368091.0,
+      "reward": 0.546875,
+      "reward_std": 0.27062684297561646,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000025033950806,
+      "sampling/importance_sampling_ratio/min": 0.0004618439415935427,
+      "sampling/sampling_logp_difference/max": 7.680283546447754,
+      "sampling/sampling_logp_difference/mean": 0.01847894862294197,
+      "step": 596
+    },
+    {
+      "clip_ratio/high_max": 5.765416517533595e-06,
+      "clip_ratio/high_mean": 1.4413541293833987e-06,
+      "clip_ratio/low_mean": 3.1269102407804894e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.2710456423501455e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16208.0,
+      "completions/mean_length": 5486.3671875,
+      "completions/mean_terminated_length": 5224.82421875,
+      "completions/min_length": 4.0,
+      "completions/min_terminated_length": 4.0,
+      "entropy": 0.9588652476668358,
+      "epoch": 0.5492180312787488,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004545152187347412,
+      "learning_rate": 1e-05,
+      "loss": 0.0549,
+      "num_tokens": 526095378.0,
+      "reward": 0.359375,
+      "reward_std": 0.33508801460266113,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998891353607178,
+      "sampling/importance_sampling_ratio/min": 6.280510569922626e-05,
+      "sampling/sampling_logp_difference/max": 9.675474166870117,
+      "sampling/sampling_logp_difference/mean": 0.02017204463481903,
+      "step": 597
+    },
+    {
+      "clip_ratio/high_max": 1.519483475931338e-05,
+      "clip_ratio/high_mean": 4.732241109195456e-06,
+      "clip_ratio/low_mean": 4.477498589494644e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.950722734520241e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16169.0,
+      "completions/max_terminated_length": 16169.0,
+      "completions/mean_length": 6636.0078125,
+      "completions/mean_terminated_length": 6636.0078125,
+      "completions/min_length": 685.0,
+      "completions/min_terminated_length": 685.0,
+      "entropy": 0.9497648254036903,
+      "epoch": 0.5501379944802208,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004040954168885946,
+      "learning_rate": 1e-05,
+      "loss": 0.0477,
+      "num_tokens": 526969459.0,
+      "reward": 0.3515625,
+      "reward_std": 0.3158818483352661,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999474287033081,
+      "sampling/importance_sampling_ratio/min": 2.2340275407373156e-08,
+      "sampling/sampling_logp_difference/max": 17.61687469482422,
+      "sampling/sampling_logp_difference/mean": 0.02086419239640236,
+      "step": 598
+    },
+    {
+      "clip_ratio/high_max": 1.5785165032866644e-05,
+      "clip_ratio/high_mean": 3.946291258216661e-06,
+      "clip_ratio/low_mean": 4.7215530003086315e-05,
+      "clip_ratio/low_min": 5.274039267533226e-06,
+      "clip_ratio/region_mean": 5.116182205711084e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15820.0,
+      "completions/mean_length": 6462.953125,
+      "completions/mean_terminated_length": 6142.9189453125,
+      "completions/min_length": 824.0,
+      "completions/min_terminated_length": 824.0,
+      "entropy": 0.9401230812072754,
+      "epoch": 0.5510579576816927,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004678349941968918,
+      "learning_rate": 1e-05,
+      "loss": 0.1854,
+      "num_tokens": 527822197.0,
+      "reward": 0.5234375,
+      "reward_std": 0.3345640003681183,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997877478599548,
+      "sampling/importance_sampling_ratio/min": 2.8560234568431042e-05,
+      "sampling/sampling_logp_difference/max": 10.463495254516602,
+      "sampling/sampling_logp_difference/mean": 0.019832316786050797,
+      "step": 599
+    },
+    {
+      "clip_ratio/high_max": 4.1415414671064354e-06,
+      "clip_ratio/high_mean": 1.0353853667766089e-06,
+      "clip_ratio/low_mean": 4.795687004843785e-05,
+      "clip_ratio/low_min": 7.76807610236574e-06,
+      "clip_ratio/region_mean": 4.899225518784078e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15170.0,
+      "completions/mean_length": 7172.1015625,
+      "completions/mean_terminated_length": 6951.01611328125,
+      "completions/min_length": 1079.0,
+      "completions/min_terminated_length": 1079.0,
+      "entropy": 0.7962061613798141,
+      "epoch": 0.5519779208831647,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0014094997895881534,
+      "learning_rate": 1e-05,
+      "loss": 0.0668,
+      "num_tokens": 528759458.0,
+      "reward": 0.3515625,
+      "reward_std": 0.16834919154644012,
+      "rewards/accuracy_reward/mean": 0.3515625,
+      "rewards/accuracy_reward/std": 0.4793342351913452,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999281167984009,
+      "sampling/importance_sampling_ratio/min": 0.001331693259999156,
+      "sampling/sampling_logp_difference/max": 6.621304035186768,
+      "sampling/sampling_logp_difference/mean": 0.018519852310419083,
+      "step": 600
+    },
+    {
+      "clip_ratio/high_max": 7.3846517807396594e-06,
+      "clip_ratio/high_mean": 3.018199095095042e-06,
+      "clip_ratio/low_mean": 5.2064756346226204e-05,
+      "clip_ratio/low_min": 5.341652013157727e-06,
+      "clip_ratio/region_mean": 5.5082955441321246e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16195.0,
+      "completions/mean_length": 6612.6484375,
+      "completions/mean_terminated_length": 6378.13623046875,
+      "completions/min_length": 480.0,
+      "completions/min_terminated_length": 480.0,
+      "entropy": 0.8218385726213455,
+      "epoch": 0.5528978840846366,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0038943374529480934,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "num_tokens": 529626893.0,
+      "reward": 0.390625,
+      "reward_std": 0.23934084177017212,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999620914459229,
+      "sampling/importance_sampling_ratio/min": 0.0024450027849525213,
+      "sampling/sampling_logp_difference/max": 6.01370906829834,
+      "sampling/sampling_logp_difference/mean": 0.018441151827573776,
+      "step": 601
+    },
+    {
+      "clip_ratio/high_max": 8.209965471905889e-06,
+      "clip_ratio/high_mean": 2.0524913679764722e-06,
+      "clip_ratio/low_mean": 4.8717710285473004e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.077020244925734e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15898.0,
+      "completions/mean_length": 6574.9140625,
+      "completions/mean_terminated_length": 6419.21484375,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.9268836230039597,
+      "epoch": 0.5538178472861086,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027088895440101624,
+      "learning_rate": 1e-05,
+      "loss": 0.0577,
+      "num_tokens": 530486578.0,
+      "reward": 0.4453125,
+      "reward_std": 0.26143792271614075,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000026822090149,
+      "sampling/importance_sampling_ratio/min": 1.1735714906535577e-05,
+      "sampling/sampling_logp_difference/max": 11.352873802185059,
+      "sampling/sampling_logp_difference/mean": 0.020115964114665985,
+      "step": 602
+    },
+    {
+      "clip_ratio/high_max": 5.24967435922008e-06,
+      "clip_ratio/high_mean": 1.31241858980502e-06,
+      "clip_ratio/low_mean": 1.3909025255998131e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.5221443845803151e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14361.0,
+      "completions/mean_length": 6209.1953125,
+      "completions/mean_terminated_length": 6129.07861328125,
+      "completions/min_length": 598.0,
+      "completions/min_terminated_length": 598.0,
+      "entropy": 0.9574517607688904,
+      "epoch": 0.5547378104875805,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.002628365531563759,
+      "learning_rate": 1e-05,
+      "loss": 0.0461,
+      "num_tokens": 531303083.0,
+      "reward": 0.3671875,
+      "reward_std": 0.13098490238189697,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998608827590942,
+      "sampling/importance_sampling_ratio/min": 2.862734254449606e-05,
+      "sampling/sampling_logp_difference/max": 10.461148262023926,
+      "sampling/sampling_logp_difference/mean": 0.019658785313367844,
+      "step": 603
+    },
+    {
+      "clip_ratio/high_max": 1.9014597455679905e-05,
+      "clip_ratio/high_mean": 4.753649363919976e-06,
+      "clip_ratio/low_mean": 4.9158792762682424e-05,
+      "clip_ratio/low_min": 4.514427928370424e-06,
+      "clip_ratio/region_mean": 5.39124412171077e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13873.0,
+      "completions/mean_length": 7079.1875,
+      "completions/mean_terminated_length": 6855.87255859375,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 0.853938102722168,
+      "epoch": 0.5556577736890524,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004664157051593065,
+      "learning_rate": 1e-05,
+      "loss": 0.0285,
+      "num_tokens": 532228227.0,
+      "reward": 0.2734375,
+      "reward_std": 0.30327796936035156,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999879598617554,
+      "sampling/importance_sampling_ratio/min": 5.377535785555665e-07,
+      "sampling/sampling_logp_difference/max": 14.43586540222168,
+      "sampling/sampling_logp_difference/mean": 0.018260695040225983,
+      "step": 604
+    },
+    {
+      "clip_ratio/high_max": 3.025483556484687e-05,
+      "clip_ratio/high_mean": 7.563708891211718e-06,
+      "clip_ratio/low_mean": 2.1738228269896354e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9301936820047558e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15094.0,
+      "completions/max_terminated_length": 15094.0,
+      "completions/mean_length": 6071.5390625,
+      "completions/mean_terminated_length": 6071.5390625,
+      "completions/min_length": 742.0,
+      "completions/min_terminated_length": 742.0,
+      "entropy": 0.980722151696682,
+      "epoch": 0.5565777368905244,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004579839296638966,
+      "learning_rate": 1e-05,
+      "loss": 0.0168,
+      "num_tokens": 533024264.0,
+      "reward": 0.4765625,
+      "reward_std": 0.30327799916267395,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999982476234436,
+      "sampling/importance_sampling_ratio/min": 0.0003390153287909925,
+      "sampling/sampling_logp_difference/max": 7.989465236663818,
+      "sampling/sampling_logp_difference/mean": 0.01974770799279213,
+      "step": 605
+    },
+    {
+      "clip_ratio/high_max": 1.3344870239961892e-05,
+      "clip_ratio/high_mean": 4.773990667672479e-06,
+      "clip_ratio/low_mean": 5.142044130934664e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6194432318079635e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16086.0,
+      "completions/mean_length": 7352.484375,
+      "completions/mean_terminated_length": 7209.12744140625,
+      "completions/min_length": 1310.0,
+      "completions/min_terminated_length": 1310.0,
+      "entropy": 0.7858814746141434,
+      "epoch": 0.5574977000919963,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.002537919208407402,
+      "learning_rate": 1e-05,
+      "loss": 0.0576,
+      "num_tokens": 533985318.0,
+      "reward": 0.3125,
+      "reward_std": 0.2580229938030243,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999037981033325,
+      "sampling/importance_sampling_ratio/min": 0.0017827138071879745,
+      "sampling/sampling_logp_difference/max": 6.329618453979492,
+      "sampling/sampling_logp_difference/mean": 0.018647275865077972,
+      "step": 606
+    },
+    {
+      "clip_ratio/high_max": 2.345925531699322e-05,
+      "clip_ratio/high_mean": 7.0977013137962786e-06,
+      "clip_ratio/low_mean": 4.466222731025482e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.175992941985896e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16082.0,
+      "completions/mean_length": 7095.1875,
+      "completions/mean_terminated_length": 6947.74658203125,
+      "completions/min_length": 1073.0,
+      "completions/min_terminated_length": 1073.0,
+      "entropy": 0.6846291124820709,
+      "epoch": 0.5584176632934683,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037982286885380745,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 534912558.0,
+      "reward": 0.53125,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999147057533264,
+      "sampling/importance_sampling_ratio/min": 8.089523180387914e-05,
+      "sampling/sampling_logp_difference/max": 9.422355651855469,
+      "sampling/sampling_logp_difference/mean": 0.01693977229297161,
+      "step": 607
+    },
+    {
+      "clip_ratio/high_max": 5.167851668375079e-06,
+      "clip_ratio/high_mean": 1.2919629170937696e-06,
+      "clip_ratio/low_mean": 6.557838094067847e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.687034363039857e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15572.0,
+      "completions/mean_length": 6038.1953125,
+      "completions/mean_terminated_length": 5873.9765625,
+      "completions/min_length": 677.0,
+      "completions/min_terminated_length": 677.0,
+      "entropy": 0.8637901693582535,
+      "epoch": 0.5593376264949402,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0030545955523848534,
+      "learning_rate": 1e-05,
+      "loss": 0.0716,
+      "num_tokens": 535707127.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3243142366409302,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999387264251709,
+      "sampling/importance_sampling_ratio/min": 0.00017956242663785815,
+      "sampling/sampling_logp_difference/max": 8.624987602233887,
+      "sampling/sampling_logp_difference/mean": 0.018705151975154877,
+      "step": 608
+    },
+    {
+      "clip_ratio/high_max": 1.7691760149318725e-05,
+      "clip_ratio/high_mean": 5.544901910070621e-06,
+      "clip_ratio/low_mean": 5.012885230826214e-05,
+      "clip_ratio/low_min": 3.5653165468829684e-06,
+      "clip_ratio/region_mean": 5.5673754559393274e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14906.0,
+      "completions/mean_length": 6978.0078125,
+      "completions/mean_terminated_length": 6828.70654296875,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "entropy": 0.7931060045957565,
+      "epoch": 0.5602575896964122,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002951717935502529,
+      "learning_rate": 1e-05,
+      "loss": 0.0698,
+      "num_tokens": 536618376.0,
+      "reward": 0.46875,
+      "reward_std": 0.3527044355869293,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999598264694214,
+      "sampling/importance_sampling_ratio/min": 3.865327380481176e-05,
+      "sampling/sampling_logp_difference/max": 10.160879135131836,
+      "sampling/sampling_logp_difference/mean": 0.018486514687538147,
+      "step": 609
+    },
+    {
+      "clip_ratio/high_max": 2.1591150925814873e-05,
+      "clip_ratio/high_mean": 5.397787731453718e-06,
+      "clip_ratio/low_mean": 6.101864732954709e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.6416435629435e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15329.0,
+      "completions/max_terminated_length": 15329.0,
+      "completions/mean_length": 6810.15625,
+      "completions/mean_terminated_length": 6810.15625,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.8957240954041481,
+      "epoch": 0.5611775528978841,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0019385438645258546,
+      "learning_rate": 1e-05,
+      "loss": 0.0973,
+      "num_tokens": 537513876.0,
+      "reward": 0.328125,
+      "reward_std": 0.28011518716812134,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000025749206543,
+      "sampling/importance_sampling_ratio/min": 4.845474904868752e-05,
+      "sampling/sampling_logp_difference/max": 9.934880256652832,
+      "sampling/sampling_logp_difference/mean": 0.02021351456642151,
+      "step": 610
+    },
+    {
+      "clip_ratio/high_max": 1.4817902865615906e-05,
+      "clip_ratio/high_mean": 5.914362077419355e-06,
+      "clip_ratio/low_mean": 1.2616926369446446e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8531288333178964e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16065.0,
+      "completions/mean_length": 6940.4140625,
+      "completions/mean_terminated_length": 6713.7685546875,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.8646975234150887,
+      "epoch": 0.562097516099356,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.001886329147964716,
+      "learning_rate": 1e-05,
+      "loss": 0.0319,
+      "num_tokens": 538419265.0,
+      "reward": 0.375,
+      "reward_std": 0.19568344950675964,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000052452087402,
+      "sampling/importance_sampling_ratio/min": 6.893687327647058e-07,
+      "sampling/sampling_logp_difference/max": 14.18748950958252,
+      "sampling/sampling_logp_difference/mean": 0.019072774797677994,
+      "step": 611
+    },
+    {
+      "clip_ratio/high_max": 6.3681300161988474e-06,
+      "clip_ratio/high_mean": 1.5920325040497119e-06,
+      "clip_ratio/low_mean": 3.254086982451554e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4132902555938927e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15960.0,
+      "completions/mean_length": 7508.796875,
+      "completions/mean_terminated_length": 6995.35498046875,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 0.7723299860954285,
+      "epoch": 0.563017479300828,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002031022449955344,
+      "learning_rate": 1e-05,
+      "loss": 0.0335,
+      "num_tokens": 539399127.0,
+      "reward": 0.4296875,
+      "reward_std": 0.2301519513130188,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999816417694092,
+      "sampling/importance_sampling_ratio/min": 0.0056421491317451,
+      "sampling/sampling_logp_difference/max": 5.177490234375,
+      "sampling/sampling_logp_difference/mean": 0.01832709088921547,
+      "step": 612
+    },
+    {
+      "clip_ratio/high_max": 1.5848977909627138e-05,
+      "clip_ratio/high_mean": 3.9622444774067844e-06,
+      "clip_ratio/low_mean": 2.6742804038804024e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.070504851621081e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15816.0,
+      "completions/mean_length": 6019.6484375,
+      "completions/mean_terminated_length": 5938.03955078125,
+      "completions/min_length": 1020.0,
+      "completions/min_terminated_length": 1020.0,
+      "entropy": 0.7425512671470642,
+      "epoch": 0.5639374425022999,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003653773572295904,
+      "learning_rate": 1e-05,
+      "loss": 0.1072,
+      "num_tokens": 540189602.0,
+      "reward": 0.53125,
+      "reward_std": 0.26143303513526917,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999122619628906,
+      "sampling/importance_sampling_ratio/min": 0.005288486368954182,
+      "sampling/sampling_logp_difference/max": 5.242223262786865,
+      "sampling/sampling_logp_difference/mean": 0.017161473631858826,
+      "step": 613
+    },
+    {
+      "clip_ratio/high_max": 1.1017190900020069e-05,
+      "clip_ratio/high_mean": 2.754297725005017e-06,
+      "clip_ratio/low_mean": 3.428678644468164e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7041084169686656e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15861.0,
+      "completions/mean_length": 7155.6953125,
+      "completions/mean_terminated_length": 6621.826171875,
+      "completions/min_length": 987.0,
+      "completions/min_terminated_length": 987.0,
+      "entropy": 0.9789249897003174,
+      "epoch": 0.5648574057037719,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003739065257832408,
+      "learning_rate": 1e-05,
+      "loss": 0.0346,
+      "num_tokens": 541125587.0,
+      "reward": 0.265625,
+      "reward_std": 0.2522490322589874,
+      "rewards/accuracy_reward/mean": 0.265625,
+      "rewards/accuracy_reward/std": 0.44340085983276367,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999271631240845,
+      "sampling/importance_sampling_ratio/min": 9.236609002982732e-06,
+      "sampling/sampling_logp_difference/max": 11.59233570098877,
+      "sampling/sampling_logp_difference/mean": 0.02008877694606781,
+      "step": 614
+    },
+    {
+      "clip_ratio/high_max": 5.6091539590852335e-06,
+      "clip_ratio/high_mean": 2.4549021873099264e-06,
+      "clip_ratio/low_mean": 4.249646542575647e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.4951367613066395e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13553.0,
+      "completions/mean_length": 8027.359375,
+      "completions/mean_terminated_length": 7470.25048828125,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 0.9153474718332291,
+      "epoch": 0.5657773689052438,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0020656392443925142,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "num_tokens": 542173801.0,
+      "reward": 0.2578125,
+      "reward_std": 0.22225633263587952,
+      "rewards/accuracy_reward/mean": 0.2578125,
+      "rewards/accuracy_reward/std": 0.43914902210235596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999947190284729,
+      "sampling/importance_sampling_ratio/min": 0.00029620854184031487,
+      "sampling/sampling_logp_difference/max": 8.124446868896484,
+      "sampling/sampling_logp_difference/mean": 0.021495234221220016,
+      "step": 615
+    },
+    {
+      "clip_ratio/high_max": 1.7302586002188036e-05,
+      "clip_ratio/high_mean": 4.325646500547009e-06,
+      "clip_ratio/low_mean": 5.2193488272678223e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.6519134659538395e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 6115.3828125,
+      "completions/mean_terminated_length": 5952.38916015625,
+      "completions/min_length": 1158.0,
+      "completions/min_terminated_length": 1158.0,
+      "entropy": 0.751783661544323,
+      "epoch": 0.5666973321067157,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00824788399040699,
+      "learning_rate": 1e-05,
+      "loss": 0.0648,
+      "num_tokens": 542977266.0,
+      "reward": 0.4609375,
+      "reward_std": 0.30616888403892517,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999478459358215,
+      "sampling/importance_sampling_ratio/min": 0.0013296925462782383,
+      "sampling/sampling_logp_difference/max": 6.622807502746582,
+      "sampling/sampling_logp_difference/mean": 0.017732972279191017,
+      "step": 616
+    },
+    {
+      "clip_ratio/high_max": 2.872588265745435e-05,
+      "clip_ratio/high_mean": 8.185486876755022e-06,
+      "clip_ratio/low_mean": 5.301810256241879e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.120358921180014e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15688.0,
+      "completions/mean_length": 7431.3203125,
+      "completions/mean_terminated_length": 7142.52392578125,
+      "completions/min_length": 738.0,
+      "completions/min_terminated_length": 738.0,
+      "entropy": 0.9122852608561516,
+      "epoch": 0.5676172953081877,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005189655348658562,
+      "learning_rate": 1e-05,
+      "loss": 0.0613,
+      "num_tokens": 543947515.0,
+      "reward": 0.484375,
+      "reward_std": 0.21595832705497742,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999845623970032,
+      "sampling/importance_sampling_ratio/min": 0.00017607140762265772,
+      "sampling/sampling_logp_difference/max": 8.644620895385742,
+      "sampling/sampling_logp_difference/mean": 0.02111673541367054,
+      "step": 617
+    },
+    {
+      "clip_ratio/high_max": 3.984698651038343e-06,
+      "clip_ratio/high_mean": 9.961746627595858e-07,
+      "clip_ratio/low_mean": 3.414959587644262e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.514577088026272e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16378.0,
+      "completions/mean_length": 5700.5546875,
+      "completions/mean_terminated_length": 5530.9765625,
+      "completions/min_length": 727.0,
+      "completions/min_terminated_length": 727.0,
+      "entropy": 0.8961661159992218,
+      "epoch": 0.5685372585096596,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004707770887762308,
+      "learning_rate": 1e-05,
+      "loss": 0.0773,
+      "num_tokens": 544694826.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998490214347839,
+      "sampling/importance_sampling_ratio/min": 5.211461817644647e-10,
+      "sampling/sampling_logp_difference/max": 21.374990463256836,
+      "sampling/sampling_logp_difference/mean": 0.018697837367653847,
+      "step": 618
+    },
+    {
+      "clip_ratio/high_max": 1.1809721399913542e-05,
+      "clip_ratio/high_mean": 2.9524303499783855e-06,
+      "clip_ratio/low_mean": 5.229935004535946e-05,
+      "clip_ratio/low_min": 4.098226327187149e-06,
+      "clip_ratio/region_mean": 5.525178062271152e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 12422.0,
+      "completions/max_terminated_length": 12422.0,
+      "completions/mean_length": 4201.6796875,
+      "completions/mean_terminated_length": 4201.6796875,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "entropy": 0.7066933363676071,
+      "epoch": 0.5694572217111316,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.00980924628674984,
+      "learning_rate": 1e-05,
+      "loss": 0.0492,
+      "num_tokens": 545255377.0,
+      "reward": 0.5625,
+      "reward_std": 0.38664889335632324,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000074028968811,
+      "sampling/importance_sampling_ratio/min": 7.827866647858173e-05,
+      "sampling/sampling_logp_difference/max": 9.455235481262207,
+      "sampling/sampling_logp_difference/mean": 0.016301468014717102,
+      "step": 619
+    },
+    {
+      "clip_ratio/high_max": 6.093102456361521e-06,
+      "clip_ratio/high_mean": 1.5232756140903803e-06,
+      "clip_ratio/low_mean": 1.853809601470857e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.0061371856172627e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13234.0,
+      "completions/mean_length": 5782.2578125,
+      "completions/mean_terminated_length": 5613.9765625,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 0.846621498465538,
+      "epoch": 0.5703771849126035,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005619424395263195,
+      "learning_rate": 1e-05,
+      "loss": 0.077,
+      "num_tokens": 546013882.0,
+      "reward": 0.46875,
+      "reward_std": 0.2472364753484726,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000319480895996,
+      "sampling/importance_sampling_ratio/min": 9.447568299947307e-05,
+      "sampling/sampling_logp_difference/max": 9.267168045043945,
+      "sampling/sampling_logp_difference/mean": 0.018704919144511223,
+      "step": 620
+    },
+    {
+      "clip_ratio/high_max": 1.6747734207456233e-05,
+      "clip_ratio/high_mean": 4.186933551864058e-06,
+      "clip_ratio/low_mean": 4.008232758678787e-05,
+      "clip_ratio/low_min": 3.511630438879365e-06,
+      "clip_ratio/region_mean": 4.426926193445979e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15569.0,
+      "completions/mean_length": 7191.4921875,
+      "completions/mean_terminated_length": 7045.57958984375,
+      "completions/min_length": 1379.0,
+      "completions/min_terminated_length": 1379.0,
+      "entropy": 0.7846563309431076,
+      "epoch": 0.5712971481140754,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0063271005637943745,
+      "learning_rate": 1e-05,
+      "loss": 0.0964,
+      "num_tokens": 546954857.0,
+      "reward": 0.4296875,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999164342880249,
+      "sampling/importance_sampling_ratio/min": 0.006330032367259264,
+      "sampling/sampling_logp_difference/max": 5.062449932098389,
+      "sampling/sampling_logp_difference/mean": 0.01846012845635414,
+      "step": 621
+    },
+    {
+      "clip_ratio/high_max": 3.451678094279487e-05,
+      "clip_ratio/high_mean": 1.2486661603361426e-05,
+      "clip_ratio/low_mean": 5.253966105556174e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.502632390947838e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15529.0,
+      "completions/max_terminated_length": 15529.0,
+      "completions/mean_length": 5491.7421875,
+      "completions/mean_terminated_length": 5491.7421875,
+      "completions/min_length": 1644.0,
+      "completions/min_terminated_length": 1644.0,
+      "entropy": 0.6960643380880356,
+      "epoch": 0.5722171113155474,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005836677737534046,
+      "learning_rate": 1e-05,
+      "loss": 0.1013,
+      "num_tokens": 547676024.0,
+      "reward": 0.5625,
+      "reward_std": 0.43213340640068054,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999930739402771,
+      "sampling/importance_sampling_ratio/min": 0.00043176248436793685,
+      "sampling/sampling_logp_difference/max": 7.7476348876953125,
+      "sampling/sampling_logp_difference/mean": 0.016565188765525818,
+      "step": 622
+    },
+    {
+      "clip_ratio/high_max": 4.318982973927632e-06,
+      "clip_ratio/high_mean": 1.079745743481908e-06,
+      "clip_ratio/low_mean": 3.0399249226320535e-05,
+      "clip_ratio/low_min": 5.838393462909153e-06,
+      "clip_ratio/region_mean": 3.147899496980244e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16179.0,
+      "completions/mean_length": 6993.125,
+      "completions/mean_terminated_length": 6844.06396484375,
+      "completions/min_length": 980.0,
+      "completions/min_terminated_length": 980.0,
+      "entropy": 0.8031502217054367,
+      "epoch": 0.5731370745170193,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.00226933928206563,
+      "learning_rate": 1e-05,
+      "loss": 0.0326,
+      "num_tokens": 548590080.0,
+      "reward": 0.3984375,
+      "reward_std": 0.19332444667816162,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000011682510376,
+      "sampling/importance_sampling_ratio/min": 1.1417677114877733e-06,
+      "sampling/sampling_logp_difference/max": 13.68293285369873,
+      "sampling/sampling_logp_difference/mean": 0.01880657486617565,
+      "step": 623
+    },
+    {
+      "clip_ratio/high_max": 8.404208529100288e-06,
+      "clip_ratio/high_mean": 2.101052132275072e-06,
+      "clip_ratio/low_mean": 4.231840989632474e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.441946202859981e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15278.0,
+      "completions/max_terminated_length": 15278.0,
+      "completions/mean_length": 5602.8359375,
+      "completions/mean_terminated_length": 5602.8359375,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "entropy": 0.8287182524800301,
+      "epoch": 0.5740570377184913,
+      "frac_reward_zero_std": 0.1875,
+      "grad_norm": 0.005067484453320503,
+      "learning_rate": 1e-05,
+      "loss": 0.0394,
+      "num_tokens": 549327251.0,
+      "reward": 0.5,
+      "reward_std": 0.35218530893325806,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999701380729675,
+      "sampling/importance_sampling_ratio/min": 0.0036069792695343494,
+      "sampling/sampling_logp_difference/max": 5.624884605407715,
+      "sampling/sampling_logp_difference/mean": 0.018545404076576233,
+      "step": 624
+    },
+    {
+      "clip_ratio/high_max": 7.49742275729659e-06,
+      "clip_ratio/high_mean": 1.8743556893241475e-06,
+      "clip_ratio/low_mean": 4.6288066641864134e-05,
+      "clip_ratio/low_min": 5.32640206074575e-06,
+      "clip_ratio/region_mean": 4.816242244487512e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15901.0,
+      "completions/mean_length": 6747.0234375,
+      "completions/mean_terminated_length": 6671.1416015625,
+      "completions/min_length": 879.0,
+      "completions/min_terminated_length": 879.0,
+      "entropy": 0.8722762316465378,
+      "epoch": 0.5749770009199632,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0023132911883294582,
+      "learning_rate": 1e-05,
+      "loss": 0.0064,
+      "num_tokens": 550208750.0,
+      "reward": 0.390625,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999475479125977,
+      "sampling/importance_sampling_ratio/min": 0.003727440955117345,
+      "sampling/sampling_logp_difference/max": 5.592033386230469,
+      "sampling/sampling_logp_difference/mean": 0.019216621294617653,
+      "step": 625
+    },
+    {
+      "clip_ratio/high_max": 7.693567567912396e-06,
+      "clip_ratio/high_mean": 1.923391891978099e-06,
+      "clip_ratio/low_mean": 6.517495285152108e-05,
+      "clip_ratio/low_min": 1.1217302017030306e-05,
+      "clip_ratio/region_mean": 6.709834497087286e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16027.0,
+      "completions/max_terminated_length": 16027.0,
+      "completions/mean_length": 6983.40625,
+      "completions/mean_terminated_length": 6983.40625,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 0.8781512826681137,
+      "epoch": 0.5758969641214351,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0036700034979730844,
+      "learning_rate": 1e-05,
+      "loss": 0.0905,
+      "num_tokens": 551123002.0,
+      "reward": 0.328125,
+      "reward_std": 0.2419992983341217,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999868273735046,
+      "sampling/importance_sampling_ratio/min": 5.0360464229015633e-05,
+      "sampling/sampling_logp_difference/max": 9.8963041305542,
+      "sampling/sampling_logp_difference/mean": 0.019318291917443275,
+      "step": 626
+    },
+    {
+      "clip_ratio/high_max": 5.098295332572889e-06,
+      "clip_ratio/high_mean": 1.2745738331432221e-06,
+      "clip_ratio/low_mean": 5.9073974398415885e-05,
+      "clip_ratio/low_min": 6.781316187698394e-06,
+      "clip_ratio/region_mean": 6.034854845893278e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16201.0,
+      "completions/mean_length": 7143.671875,
+      "completions/mean_terminated_length": 6689.22900390625,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.7715872526168823,
+      "epoch": 0.5768169273229071,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0036717690527439117,
+      "learning_rate": 1e-05,
+      "loss": 0.0268,
+      "num_tokens": 552055472.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2212003767490387,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998798966407776,
+      "sampling/importance_sampling_ratio/min": 0.00012340980174485594,
+      "sampling/sampling_logp_difference/max": 9.0,
+      "sampling/sampling_logp_difference/mean": 0.018518533557653427,
+      "step": 627
+    },
+    {
+      "clip_ratio/high_max": 1.778747127900715e-05,
+      "clip_ratio/high_mean": 4.4468678197517875e-06,
+      "clip_ratio/low_mean": 2.460010267668622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.9046970439594588e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15729.0,
+      "completions/mean_length": 6558.5859375,
+      "completions/mean_terminated_length": 6075.36865234375,
+      "completions/min_length": 1061.0,
+      "completions/min_terminated_length": 1061.0,
+      "entropy": 0.9016438648104668,
+      "epoch": 0.577736890524379,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0019187588477507234,
+      "learning_rate": 1e-05,
+      "loss": 0.0494,
+      "num_tokens": 552914275.0,
+      "reward": 0.484375,
+      "reward_std": 0.2041158676147461,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999418258666992,
+      "sampling/importance_sampling_ratio/min": 0.00011496193474158645,
+      "sampling/sampling_logp_difference/max": 9.07090950012207,
+      "sampling/sampling_logp_difference/mean": 0.01948089525103569,
+      "step": 628
+    },
+    {
+      "clip_ratio/high_max": 1.383282506139949e-05,
+      "clip_ratio/high_mean": 3.4582062653498724e-06,
+      "clip_ratio/low_mean": 4.3287541757308645e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.674574802265852e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15812.0,
+      "completions/max_terminated_length": 15812.0,
+      "completions/mean_length": 6150.2734375,
+      "completions/mean_terminated_length": 6150.2734375,
+      "completions/min_length": 596.0,
+      "completions/min_terminated_length": 596.0,
+      "entropy": 0.8385711833834648,
+      "epoch": 0.578656853725851,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003598993644118309,
+      "learning_rate": 1e-05,
+      "loss": 0.0907,
+      "num_tokens": 553719958.0,
+      "reward": 0.5078125,
+      "reward_std": 0.3022220730781555,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999948740005493,
+      "sampling/importance_sampling_ratio/min": 0.000830297009088099,
+      "sampling/sampling_logp_difference/max": 7.093727111816406,
+      "sampling/sampling_logp_difference/mean": 0.019557828083634377,
+      "step": 629
+    },
+    {
+      "clip_ratio/high_max": 2.668830120455823e-06,
+      "clip_ratio/high_mean": 6.672075301139557e-07,
+      "clip_ratio/low_mean": 1.7461135655594262e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.8128343185708218e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16354.0,
+      "completions/mean_length": 8142.46875,
+      "completions/mean_terminated_length": 7519.16015625,
+      "completions/min_length": 1828.0,
+      "completions/min_terminated_length": 1828.0,
+      "entropy": 0.8508284538984299,
+      "epoch": 0.5795768169273229,
+      "frac_reward_zero_std": 0.6875,
+      "grad_norm": 0.002453390508890152,
+      "learning_rate": 1e-05,
+      "loss": 0.0261,
+      "num_tokens": 554784458.0,
+      "reward": 0.390625,
+      "reward_std": 0.1422954648733139,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999715089797974,
+      "sampling/importance_sampling_ratio/min": 0.0002036939695244655,
+      "sampling/sampling_logp_difference/max": 8.498891830444336,
+      "sampling/sampling_logp_difference/mean": 0.019445519894361496,
+      "step": 630
+    },
+    {
+      "clip_ratio/high_max": 1.9002460248884745e-05,
+      "clip_ratio/high_mean": 4.750615062221186e-06,
+      "clip_ratio/low_mean": 3.1556500402984966e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.630711614732718e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16109.0,
+      "completions/mean_length": 7665.921875,
+      "completions/mean_terminated_length": 7384.693359375,
+      "completions/min_length": 791.0,
+      "completions/min_terminated_length": 791.0,
+      "entropy": 0.7667205557227135,
+      "epoch": 0.5804967801287948,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0027936683036386967,
+      "learning_rate": 1e-05,
+      "loss": 0.0245,
+      "num_tokens": 555783296.0,
+      "reward": 0.4296875,
+      "reward_std": 0.24435830116271973,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998488426208496,
+      "sampling/importance_sampling_ratio/min": 0.0002781523216981441,
+      "sampling/sampling_logp_difference/max": 8.187341690063477,
+      "sampling/sampling_logp_difference/mean": 0.01912892609834671,
+      "step": 631
+    },
+    {
+      "clip_ratio/high_max": 1.5569996094200178e-05,
+      "clip_ratio/high_mean": 3.8924990235500445e-06,
+      "clip_ratio/low_mean": 3.8605214058407e-05,
+      "clip_ratio/low_min": 6.2870940382708795e-06,
+      "clip_ratio/region_mean": 4.249771222930576e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16379.0,
+      "completions/mean_length": 7266.171875,
+      "completions/mean_terminated_length": 6972.04833984375,
+      "completions/min_length": 1117.0,
+      "completions/min_terminated_length": 1117.0,
+      "entropy": 0.7114122956991196,
+      "epoch": 0.5814167433302668,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004213637672364712,
+      "learning_rate": 1e-05,
+      "loss": 0.1028,
+      "num_tokens": 556732942.0,
+      "reward": 0.5390625,
+      "reward_std": 0.3135277032852173,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999159574508667,
+      "sampling/importance_sampling_ratio/min": 1.760348027346481e-06,
+      "sampling/sampling_logp_difference/max": 13.249999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01689826510846615,
+      "step": 632
+    },
+    {
+      "clip_ratio/high_max": 2.1737864472015644e-05,
+      "clip_ratio/high_mean": 5.434466118003911e-06,
+      "clip_ratio/low_mean": 3.640393322257296e-05,
+      "clip_ratio/low_min": 3.0146634344419e-06,
+      "clip_ratio/region_mean": 4.183839985216764e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16054.0,
+      "completions/mean_length": 6532.9921875,
+      "completions/mean_terminated_length": 6296.568359375,
+      "completions/min_length": 757.0,
+      "completions/min_terminated_length": 757.0,
+      "entropy": 0.7711968123912811,
+      "epoch": 0.5823367065317387,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004169877618551254,
+      "learning_rate": 1e-05,
+      "loss": 0.0406,
+      "num_tokens": 557589141.0,
+      "reward": 0.546875,
+      "reward_std": 0.2675113081932068,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999022483825684,
+      "sampling/importance_sampling_ratio/min": 4.499705482885474e-06,
+      "sampling/sampling_logp_difference/max": 12.311498641967773,
+      "sampling/sampling_logp_difference/mean": 0.018738210201263428,
+      "step": 633
+    },
+    {
+      "clip_ratio/high_max": 6.099523716329713e-06,
+      "clip_ratio/high_mean": 1.5248809290824283e-06,
+      "clip_ratio/low_mean": 6.070675681257853e-05,
+      "clip_ratio/low_min": 5.175126261747209e-06,
+      "clip_ratio/region_mean": 6.223163745744387e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16337.0,
+      "completions/mean_length": 7384.3203125,
+      "completions/mean_terminated_length": 7168.328125,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "entropy": 0.8054972141981125,
+      "epoch": 0.5832566697332107,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0032470994628965855,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 558557286.0,
+      "reward": 0.4140625,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999680519104004,
+      "sampling/importance_sampling_ratio/min": 0.00019634375348687172,
+      "sampling/sampling_logp_difference/max": 8.535643577575684,
+      "sampling/sampling_logp_difference/mean": 0.019018521532416344,
+      "step": 634
+    },
+    {
+      "clip_ratio/high_max": 4.436853964762122e-05,
+      "clip_ratio/high_mean": 1.1092134911905305e-05,
+      "clip_ratio/low_mean": 3.798940008437057e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.908153437099827e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15918.0,
+      "completions/mean_length": 6131.9453125,
+      "completions/mean_terminated_length": 6051.22021484375,
+      "completions/min_length": 820.0,
+      "completions/min_terminated_length": 820.0,
+      "entropy": 0.8365718051791191,
+      "epoch": 0.5841766329346826,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004848263692110777,
+      "learning_rate": 1e-05,
+      "loss": 0.1247,
+      "num_tokens": 559364639.0,
+      "reward": 0.5625,
+      "reward_std": 0.27328526973724365,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000056266784668,
+      "sampling/importance_sampling_ratio/min": 5.424115443020128e-06,
+      "sampling/sampling_logp_difference/max": 12.124655723571777,
+      "sampling/sampling_logp_difference/mean": 0.018360167741775513,
+      "step": 635
+    },
+    {
+      "clip_ratio/high_max": 1.9398633412492927e-05,
+      "clip_ratio/high_mean": 4.849658353123232e-06,
+      "clip_ratio/low_mean": 2.7543567512111622e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.239322609260853e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15724.0,
+      "completions/max_terminated_length": 15724.0,
+      "completions/mean_length": 5746.8828125,
+      "completions/mean_terminated_length": 5746.8828125,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "entropy": 0.6247628927230835,
+      "epoch": 0.5850965961361545,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003403177484869957,
+      "learning_rate": 1e-05,
+      "loss": 0.0279,
+      "num_tokens": 560119248.0,
+      "reward": 0.5390625,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999486207962036,
+      "sampling/importance_sampling_ratio/min": 6.475952432083432e-07,
+      "sampling/sampling_logp_difference/max": 14.25,
+      "sampling/sampling_logp_difference/mean": 0.015006184577941895,
+      "step": 636
+    },
+    {
+      "clip_ratio/high_max": 2.857848289750109e-05,
+      "clip_ratio/high_mean": 8.111364707019675e-06,
+      "clip_ratio/low_mean": 4.927243321617425e-05,
+      "clip_ratio/low_min": 5.929088274569949e-06,
+      "clip_ratio/region_mean": 5.738379809372418e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16209.0,
+      "completions/mean_length": 7313.7890625,
+      "completions/mean_terminated_length": 7096.1044921875,
+      "completions/min_length": 1068.0,
+      "completions/min_terminated_length": 1068.0,
+      "entropy": 0.8606570512056351,
+      "epoch": 0.5860165593376265,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004058506805449724,
+      "learning_rate": 1e-05,
+      "loss": 0.093,
+      "num_tokens": 561072493.0,
+      "reward": 0.375,
+      "reward_std": 0.3079911172389984,
+      "rewards/accuracy_reward/mean": 0.375,
+      "rewards/accuracy_reward/std": 0.4860251843929291,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999264478683472,
+      "sampling/importance_sampling_ratio/min": 0.0006621598731726408,
+      "sampling/sampling_logp_difference/max": 7.320003509521484,
+      "sampling/sampling_logp_difference/mean": 0.01940958946943283,
+      "step": 637
+    },
+    {
+      "clip_ratio/high_max": 2.7213282010052353e-05,
+      "clip_ratio/high_mean": 7.758043807370996e-06,
+      "clip_ratio/low_mean": 4.890350828645751e-05,
+      "clip_ratio/low_min": 3.968002147303196e-06,
+      "clip_ratio/region_mean": 5.666155129802064e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16093.0,
+      "completions/mean_length": 7495.5078125,
+      "completions/mean_terminated_length": 7425.51953125,
+      "completions/min_length": 882.0,
+      "completions/min_terminated_length": 882.0,
+      "entropy": 0.8225502669811249,
+      "epoch": 0.5869365225390984,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.002768489997833967,
+      "learning_rate": 1e-05,
+      "loss": 0.098,
+      "num_tokens": 562048734.0,
+      "reward": 0.3671875,
+      "reward_std": 0.344813734292984,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 1.4612716768169776e-05,
+      "sampling/sampling_logp_difference/max": 11.133618354797363,
+      "sampling/sampling_logp_difference/mean": 0.0189508069306612,
+      "step": 638
+    },
+    {
+      "clip_ratio/high_max": 2.5246594077543705e-05,
+      "clip_ratio/high_mean": 6.311648519385926e-06,
+      "clip_ratio/low_mean": 4.9131452101391915e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.544310107552519e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15894.0,
+      "completions/mean_length": 6856.5703125,
+      "completions/mean_terminated_length": 6627.912109375,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "entropy": 0.8542520478367805,
+      "epoch": 0.5878564857405704,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002966079628095031,
+      "learning_rate": 1e-05,
+      "loss": 0.0507,
+      "num_tokens": 562945623.0,
+      "reward": 0.40625,
+      "reward_std": 0.3016803562641144,
+      "rewards/accuracy_reward/mean": 0.40625,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998261332511902,
+      "sampling/importance_sampling_ratio/min": 0.0001795661955839023,
+      "sampling/sampling_logp_difference/max": 8.624966621398926,
+      "sampling/sampling_logp_difference/mean": 0.019664689898490906,
+      "step": 639
+    },
+    {
+      "clip_ratio/high_max": 1.2127683930884814e-05,
+      "clip_ratio/high_mean": 5.316983106240514e-06,
+      "clip_ratio/low_mean": 4.154238490627904e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.685936778514588e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15231.0,
+      "completions/mean_length": 6463.2421875,
+      "completions/mean_terminated_length": 6305.77001953125,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "entropy": 0.8427078947424889,
+      "epoch": 0.5887764489420423,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0021058651618659496,
+      "learning_rate": 1e-05,
+      "loss": 0.0164,
+      "num_tokens": 563789214.0,
+      "reward": 0.3046875,
+      "reward_std": 0.24541424214839935,
+      "rewards/accuracy_reward/mean": 0.3046875,
+      "rewards/accuracy_reward/std": 0.46208351850509644,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998518824577332,
+      "sampling/importance_sampling_ratio/min": 0.00043074542190879583,
+      "sampling/sampling_logp_difference/max": 7.749993324279785,
+      "sampling/sampling_logp_difference/mean": 0.01898353546857834,
+      "step": 640
+    },
+    {
+      "clip_ratio/high_max": 1.2559269862322253e-05,
+      "clip_ratio/high_mean": 3.1398174655805633e-06,
+      "clip_ratio/low_mean": 3.146892504446441e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.4608742623731814e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15232.0,
+      "completions/max_terminated_length": 15232.0,
+      "completions/mean_length": 6140.7734375,
+      "completions/mean_terminated_length": 6140.7734375,
+      "completions/min_length": 780.0,
+      "completions/min_terminated_length": 780.0,
+      "entropy": 0.8800382614135742,
+      "epoch": 0.5896964121435143,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005890186410397291,
+      "learning_rate": 1e-05,
+      "loss": 0.0816,
+      "num_tokens": 564596185.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23486016690731049,
+      "rewards/accuracy_reward/mean": 0.4765625,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998830556869507,
+      "sampling/importance_sampling_ratio/min": 0.000808614946436137,
+      "sampling/sampling_logp_difference/max": 7.120187759399414,
+      "sampling/sampling_logp_difference/mean": 0.01930009014904499,
+      "step": 641
+    },
+    {
+      "clip_ratio/high_max": 5.099334885017015e-06,
+      "clip_ratio/high_mean": 1.2748337212542538e-06,
+      "clip_ratio/low_mean": 4.3151162458343606e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.442599617959786e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16246.0,
+      "completions/mean_length": 6361.703125,
+      "completions/mean_terminated_length": 6202.61962890625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.8246701806783676,
+      "epoch": 0.5906163753449862,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003226465079933405,
+      "learning_rate": 1e-05,
+      "loss": -0.0094,
+      "num_tokens": 565430387.0,
+      "reward": 0.359375,
+      "reward_std": 0.2682726979255676,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999127984046936,
+      "sampling/importance_sampling_ratio/min": 0.004490039311349392,
+      "sampling/sampling_logp_difference/max": 5.405893802642822,
+      "sampling/sampling_logp_difference/mean": 0.019014433026313782,
+      "step": 642
+    },
+    {
+      "clip_ratio/high_max": 2.8547008014356834e-05,
+      "clip_ratio/high_mean": 7.822751001640427e-06,
+      "clip_ratio/low_mean": 3.808748408573592e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.591023491684609e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 16283.0,
+      "completions/max_terminated_length": 16283.0,
+      "completions/mean_length": 7363.5234375,
+      "completions/mean_terminated_length": 7363.5234375,
+      "completions/min_length": 706.0,
+      "completions/min_terminated_length": 706.0,
+      "entropy": 0.828450471162796,
+      "epoch": 0.5915363385464582,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003077681176364422,
+      "learning_rate": 1e-05,
+      "loss": 0.0597,
+      "num_tokens": 566393214.0,
+      "reward": 0.4453125,
+      "reward_std": 0.24830512702465057,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.000014066696167,
+      "sampling/importance_sampling_ratio/min": 3.297756165920873e-07,
+      "sampling/sampling_logp_difference/max": 14.924853324890137,
+      "sampling/sampling_logp_difference/mean": 0.01871068961918354,
+      "step": 643
+    },
+    {
+      "clip_ratio/high_max": 4.856254690821515e-06,
+      "clip_ratio/high_mean": 1.2140636727053788e-06,
+      "clip_ratio/low_mean": 1.9775024611590197e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.098908817060874e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16016.0,
+      "completions/mean_length": 6883.8984375,
+      "completions/mean_terminated_length": 6809.09423828125,
+      "completions/min_length": 830.0,
+      "completions/min_terminated_length": 830.0,
+      "entropy": 0.9114723727107048,
+      "epoch": 0.5924563017479301,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0023631115909665823,
+      "learning_rate": 1e-05,
+      "loss": -0.0326,
+      "num_tokens": 567294697.0,
+      "reward": 0.3359375,
+      "reward_std": 0.22567616403102875,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999625086784363,
+      "sampling/importance_sampling_ratio/min": 0.005032482091337442,
+      "sampling/sampling_logp_difference/max": 5.291841983795166,
+      "sampling/sampling_logp_difference/mean": 0.02030845358967781,
+      "step": 644
+    },
+    {
+      "clip_ratio/high_max": 4.608634753822116e-06,
+      "clip_ratio/high_mean": 1.152158688455529e-06,
+      "clip_ratio/low_mean": 3.9204465110742603e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.035662391288497e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16376.0,
+      "completions/mean_length": 6996.9296875,
+      "completions/mean_terminated_length": 6923.015625,
+      "completions/min_length": 1477.0,
+      "completions/min_terminated_length": 1477.0,
+      "entropy": 0.7864109799265862,
+      "epoch": 0.593376264949402,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.006442595738917589,
+      "learning_rate": 1e-05,
+      "loss": 0.071,
+      "num_tokens": 568210240.0,
+      "reward": 0.390625,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999593496322632,
+      "sampling/importance_sampling_ratio/min": 0.0011364181991666555,
+      "sampling/sampling_logp_difference/max": 6.779873847961426,
+      "sampling/sampling_logp_difference/mean": 0.018702290952205658,
+      "step": 645
+    },
+    {
+      "clip_ratio/high_max": 1.442532902728999e-05,
+      "clip_ratio/high_mean": 5.011521352571435e-06,
+      "clip_ratio/low_mean": 5.24772226526693e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.748874355049338e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16367.0,
+      "completions/mean_length": 6384.5546875,
+      "completions/mean_terminated_length": 6305.81884765625,
+      "completions/min_length": 878.0,
+      "completions/min_terminated_length": 878.0,
+      "entropy": 0.7353173196315765,
+      "epoch": 0.594296228150874,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004090449772775173,
+      "learning_rate": 1e-05,
+      "loss": 0.0495,
+      "num_tokens": 569046727.0,
+      "reward": 0.546875,
+      "reward_std": 0.3266732692718506,
+      "rewards/accuracy_reward/mean": 0.546875,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999207854270935,
+      "sampling/importance_sampling_ratio/min": 0.00038435845635831356,
+      "sampling/sampling_logp_difference/max": 7.8639349937438965,
+      "sampling/sampling_logp_difference/mean": 0.017125204205513,
+      "step": 646
+    },
+    {
+      "clip_ratio/high_max": 1.2007675650238525e-05,
+      "clip_ratio/high_mean": 3.0019189125596313e-06,
+      "clip_ratio/low_mean": 3.2856025427463464e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.585794411264942e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16258.0,
+      "completions/mean_length": 7074.59375,
+      "completions/mean_terminated_length": 6696.29248046875,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "entropy": 0.9198992624878883,
+      "epoch": 0.5952161913523459,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0030447279568761587,
+      "learning_rate": 1e-05,
+      "loss": 0.0076,
+      "num_tokens": 569975323.0,
+      "reward": 0.359375,
+      "reward_std": 0.17176413536071777,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999834299087524,
+      "sampling/importance_sampling_ratio/min": 0.024500105530023575,
+      "sampling/sampling_logp_difference/max": 3.709077835083008,
+      "sampling/sampling_logp_difference/mean": 0.019303584471344948,
+      "step": 647
+    },
+    {
+      "clip_ratio/high_max": 6.353676781145623e-06,
+      "clip_ratio/high_mean": 1.5884191952864057e-06,
+      "clip_ratio/low_mean": 7.121561156964162e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.280403042386752e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16211.0,
+      "completions/mean_length": 8044.2578125,
+      "completions/mean_terminated_length": 7181.52587890625,
+      "completions/min_length": 902.0,
+      "completions/min_terminated_length": 902.0,
+      "entropy": 0.8030193895101547,
+      "epoch": 0.5961361545538179,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004508152138441801,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 571024900.0,
+      "reward": 0.3203125,
+      "reward_std": 0.26698729395866394,
+      "rewards/accuracy_reward/mean": 0.3203125,
+      "rewards/accuracy_reward/std": 0.4684300124645233,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999692440032959,
+      "sampling/importance_sampling_ratio/min": 3.98563061025925e-05,
+      "sampling/sampling_logp_difference/max": 10.130229949951172,
+      "sampling/sampling_logp_difference/mean": 0.018804769963026047,
+      "step": 648
+    },
+    {
+      "clip_ratio/high_max": 6.815517735958565e-06,
+      "clip_ratio/high_mean": 1.7038794339896413e-06,
+      "clip_ratio/low_mean": 3.612134810282441e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.7825227536814054e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15903.0,
+      "completions/mean_length": 8451.7578125,
+      "completions/mean_terminated_length": 7922.94189453125,
+      "completions/min_length": 813.0,
+      "completions/min_terminated_length": 813.0,
+      "entropy": 1.008152723312378,
+      "epoch": 0.5970561177552898,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.003926917444914579,
+      "learning_rate": 1e-05,
+      "loss": 0.0265,
+      "num_tokens": 572125141.0,
+      "reward": 0.203125,
+      "reward_std": 0.19226360321044922,
+      "rewards/accuracy_reward/mean": 0.203125,
+      "rewards/accuracy_reward/std": 0.40390563011169434,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999009370803833,
+      "sampling/importance_sampling_ratio/min": 8.862401301712453e-08,
+      "sampling/sampling_logp_difference/max": 16.238862991333008,
+      "sampling/sampling_logp_difference/mean": 0.021555956453084946,
+      "step": 649
+    },
+    {
+      "clip_ratio/high_max": 1.5184358971964684e-05,
+      "clip_ratio/high_mean": 3.796089742991171e-06,
+      "clip_ratio/low_mean": 5.86272076361638e-05,
+      "clip_ratio/low_min": 1.1987166999460896e-05,
+      "clip_ratio/region_mean": 6.242329754968523e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16120.0,
+      "completions/mean_length": 7011.8203125,
+      "completions/mean_terminated_length": 6786.88818359375,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.8761812150478363,
+      "epoch": 0.5979760809567617,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0036475847009569407,
+      "learning_rate": 1e-05,
+      "loss": 0.0367,
+      "num_tokens": 573041934.0,
+      "reward": 0.3984375,
+      "reward_std": 0.31010788679122925,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999783039093018,
+      "sampling/importance_sampling_ratio/min": 3.535783980623819e-05,
+      "sampling/sampling_logp_difference/max": 10.249990463256836,
+      "sampling/sampling_logp_difference/mean": 0.02046291157603264,
+      "step": 650
+    },
+    {
+      "clip_ratio/high_max": 1.0979118769682827e-05,
+      "clip_ratio/high_mean": 2.744779692420707e-06,
+      "clip_ratio/low_mean": 4.855269958170538e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.129747910359583e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15510.0,
+      "completions/mean_length": 7665.7421875,
+      "completions/mean_terminated_length": 7161.3798828125,
+      "completions/min_length": 816.0,
+      "completions/min_terminated_length": 816.0,
+      "entropy": 0.7933268994092941,
+      "epoch": 0.5988960441582337,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038963130209594965,
+      "learning_rate": 1e-05,
+      "loss": 0.0678,
+      "num_tokens": 574040917.0,
+      "reward": 0.453125,
+      "reward_std": 0.3169426918029785,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999545812606812,
+      "sampling/importance_sampling_ratio/min": 1.5536705859631184e-06,
+      "sampling/sampling_logp_difference/max": 13.374890327453613,
+      "sampling/sampling_logp_difference/mean": 0.01943662390112877,
+      "step": 651
+    },
+    {
+      "clip_ratio/high_max": 9.610412234906107e-06,
+      "clip_ratio/high_mean": 3.893257598974742e-06,
+      "clip_ratio/low_mean": 2.4625115656817798e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8518373483166215e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16049.0,
+      "completions/mean_length": 7966.828125,
+      "completions/mean_terminated_length": 7695.30615234375,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 0.8473240435123444,
+      "epoch": 0.5998160073597056,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0030520735308527946,
+      "learning_rate": 1e-05,
+      "loss": 0.0418,
+      "num_tokens": 575078695.0,
+      "reward": 0.2734375,
+      "reward_std": 0.19332443177700043,
+      "rewards/accuracy_reward/mean": 0.2734375,
+      "rewards/accuracy_reward/std": 0.447474867105484,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000214576721191,
+      "sampling/importance_sampling_ratio/min": 0.00038126588333398104,
+      "sampling/sampling_logp_difference/max": 7.872013568878174,
+      "sampling/sampling_logp_difference/mean": 0.0197810810059309,
+      "step": 652
+    },
+    {
+      "clip_ratio/high_max": 4.0985580199048854e-05,
+      "clip_ratio/high_mean": 1.0246395049762214e-05,
+      "clip_ratio/low_mean": 3.762348410418781e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7869878471829e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15116.0,
+      "completions/max_terminated_length": 15116.0,
+      "completions/mean_length": 6384.53125,
+      "completions/mean_terminated_length": 6384.53125,
+      "completions/min_length": 1045.0,
+      "completions/min_terminated_length": 1045.0,
+      "entropy": 0.9130589440464973,
+      "epoch": 0.6007359705611776,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0029330148827284575,
+      "learning_rate": 1e-05,
+      "loss": 0.1305,
+      "num_tokens": 575915163.0,
+      "reward": 0.484375,
+      "reward_std": 0.2885475754737854,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999364614486694,
+      "sampling/importance_sampling_ratio/min": 0.0001401908230036497,
+      "sampling/sampling_logp_difference/max": 8.872506141662598,
+      "sampling/sampling_logp_difference/mean": 0.019899431616067886,
+      "step": 653
+    },
+    {
+      "clip_ratio/high_max": 4.804920081369346e-06,
+      "clip_ratio/high_mean": 1.2012300203423365e-06,
+      "clip_ratio/low_mean": 4.3348386952857254e-05,
+      "clip_ratio/low_min": 3.435481630731374e-06,
+      "clip_ratio/region_mean": 4.454961697319959e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14716.0,
+      "completions/mean_length": 7484.140625,
+      "completions/mean_terminated_length": 7414.06298828125,
+      "completions/min_length": 745.0,
+      "completions/min_terminated_length": 745.0,
+      "entropy": 0.8762720301747322,
+      "epoch": 0.6016559337626495,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0037648119032382965,
+      "learning_rate": 1e-05,
+      "loss": 0.0328,
+      "num_tokens": 576895261.0,
+      "reward": 0.3125,
+      "reward_std": 0.2987973093986511,
+      "rewards/accuracy_reward/mean": 0.3125,
+      "rewards/accuracy_reward/std": 0.4653336703777313,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999864101409912,
+      "sampling/importance_sampling_ratio/min": 0.0002691639238037169,
+      "sampling/sampling_logp_difference/max": 8.220190048217773,
+      "sampling/sampling_logp_difference/mean": 0.020455794408917427,
+      "step": 654
+    },
+    {
+      "clip_ratio/high_max": 2.329104518139502e-05,
+      "clip_ratio/high_mean": 5.822761295348755e-06,
+      "clip_ratio/low_mean": 5.7342298759976984e-05,
+      "clip_ratio/low_min": 1.5017260921013076e-05,
+      "clip_ratio/region_mean": 6.316505982795206e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15614.0,
+      "completions/mean_length": 7483.8671875,
+      "completions/mean_terminated_length": 7196.76611328125,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "entropy": 0.8481424525380135,
+      "epoch": 0.6025758969641214,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022230292670428753,
+      "learning_rate": 1e-05,
+      "loss": 0.0874,
+      "num_tokens": 577874916.0,
+      "reward": 0.453125,
+      "reward_std": 0.3322049677371979,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000191926956177,
+      "sampling/importance_sampling_ratio/min": 0.002037918195128441,
+      "sampling/sampling_logp_difference/max": 6.195826530456543,
+      "sampling/sampling_logp_difference/mean": 0.019235530868172646,
+      "step": 655
+    },
+    {
+      "clip_ratio/high_max": 8.201095170079498e-06,
+      "clip_ratio/high_mean": 2.0502737925198744e-06,
+      "clip_ratio/low_mean": 3.113216860128887e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.318244205274823e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15621.0,
+      "completions/mean_length": 6618.34375,
+      "completions/mean_terminated_length": 6541.44873046875,
+      "completions/min_length": 563.0,
+      "completions/min_terminated_length": 563.0,
+      "entropy": 0.8699518665671349,
+      "epoch": 0.6034958601655934,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003690029727295041,
+      "learning_rate": 1e-05,
+      "loss": 0.0249,
+      "num_tokens": 578741608.0,
+      "reward": 0.5390625,
+      "reward_std": 0.22673210501670837,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998915195465088,
+      "sampling/importance_sampling_ratio/min": 2.0027882783324458e-05,
+      "sampling/sampling_logp_difference/max": 10.818385124206543,
+      "sampling/sampling_logp_difference/mean": 0.019522596150636673,
+      "step": 656
+    },
+    {
+      "clip_ratio/high_max": 4.162365712545579e-06,
+      "clip_ratio/high_mean": 1.0405914281363948e-06,
+      "clip_ratio/low_mean": 5.6235591728182044e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.7276183270005276e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16165.0,
+      "completions/mean_length": 6699.6953125,
+      "completions/mean_terminated_length": 6223.41748046875,
+      "completions/min_length": 693.0,
+      "completions/min_terminated_length": 693.0,
+      "entropy": 0.7825306504964828,
+      "epoch": 0.6044158233670653,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004026883281767368,
+      "learning_rate": 1e-05,
+      "loss": 0.0846,
+      "num_tokens": 579617377.0,
+      "reward": 0.4921875,
+      "reward_std": 0.3056321144104004,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9997950792312622,
+      "sampling/importance_sampling_ratio/min": 4.181192991836724e-07,
+      "sampling/sampling_logp_difference/max": 14.687499046325684,
+      "sampling/sampling_logp_difference/mean": 0.018191896378993988,
+      "step": 657
+    },
+    {
+      "clip_ratio/high_max": 2.1518610083148815e-05,
+      "clip_ratio/high_mean": 5.379652520787204e-06,
+      "clip_ratio/low_mean": 3.858270270029607e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.396235544845695e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15406.0,
+      "completions/max_terminated_length": 15406.0,
+      "completions/mean_length": 5984.875,
+      "completions/mean_terminated_length": 5984.875,
+      "completions/min_length": 1404.0,
+      "completions/min_terminated_length": 1404.0,
+      "entropy": 0.8239431977272034,
+      "epoch": 0.6053357865685373,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004194674547761679,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 580402633.0,
+      "reward": 0.484375,
+      "reward_std": 0.3066929578781128,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999486804008484,
+      "sampling/importance_sampling_ratio/min": 0.003183862892910838,
+      "sampling/sampling_logp_difference/max": 5.749660015106201,
+      "sampling/sampling_logp_difference/mean": 0.019084136933088303,
+      "step": 658
+    },
+    {
+      "clip_ratio/high_max": 2.6722831307779416e-05,
+      "clip_ratio/high_mean": 6.680707826944854e-06,
+      "clip_ratio/low_mean": 5.0344978262728546e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.702568614651682e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15690.0,
+      "completions/mean_length": 5950.5703125,
+      "completions/mean_terminated_length": 5784.96044921875,
+      "completions/min_length": 1140.0,
+      "completions/min_terminated_length": 1140.0,
+      "entropy": 0.8884857445955276,
+      "epoch": 0.6062557497700092,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005016419570893049,
+      "learning_rate": 1e-05,
+      "loss": -0.001,
+      "num_tokens": 581187586.0,
+      "reward": 0.46875,
+      "reward_std": 0.2306838035583496,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999349117279053,
+      "sampling/importance_sampling_ratio/min": 0.002998600946739316,
+      "sampling/sampling_logp_difference/max": 5.809609413146973,
+      "sampling/sampling_logp_difference/mean": 0.01908070594072342,
+      "step": 659
+    },
+    {
+      "clip_ratio/high_max": 8.678353879076894e-06,
+      "clip_ratio/high_mean": 2.1695884697692236e-06,
+      "clip_ratio/low_mean": 2.6390790822006238e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.8560379291775462e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16101.0,
+      "completions/mean_length": 8029.7734375,
+      "completions/mean_terminated_length": 7690.17041015625,
+      "completions/min_length": 1584.0,
+      "completions/min_terminated_length": 1584.0,
+      "entropy": 0.858074463903904,
+      "epoch": 0.6071757129714811,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0035609283950179815,
+      "learning_rate": 1e-05,
+      "loss": 0.0718,
+      "num_tokens": 582236557.0,
+      "reward": 0.3828125,
+      "reward_std": 0.2409384548664093,
+      "rewards/accuracy_reward/mean": 0.3828125,
+      "rewards/accuracy_reward/std": 0.4879830479621887,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999931275844574,
+      "sampling/importance_sampling_ratio/min": 0.005219157785177231,
+      "sampling/sampling_logp_difference/max": 5.2554192543029785,
+      "sampling/sampling_logp_difference/mean": 0.01982714608311653,
+      "step": 660
+    },
+    {
+      "clip_ratio/high_max": 2.362454961257754e-05,
+      "clip_ratio/high_mean": 7.522766622969357e-06,
+      "clip_ratio/low_mean": 3.278200858858327e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.030477487049211e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16063.0,
+      "completions/mean_length": 6958.4921875,
+      "completions/mean_terminated_length": 6494.9423828125,
+      "completions/min_length": 904.0,
+      "completions/min_terminated_length": 904.0,
+      "entropy": 0.7957572638988495,
+      "epoch": 0.6080956761729531,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005020176526159048,
+      "learning_rate": 1e-05,
+      "loss": 0.0505,
+      "num_tokens": 583150740.0,
+      "reward": 0.328125,
+      "reward_std": 0.2109457552433014,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999988853931427,
+      "sampling/importance_sampling_ratio/min": 0.022197909653186798,
+      "sampling/sampling_logp_difference/max": 3.8077571392059326,
+      "sampling/sampling_logp_difference/mean": 0.018450919538736343,
+      "step": 661
+    },
+    {
+      "clip_ratio/high_max": 9.535187928122468e-06,
+      "clip_ratio/high_mean": 2.383796982030617e-06,
+      "clip_ratio/low_mean": 4.201903630018933e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.440283305484627e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15856.0,
+      "completions/max_terminated_length": 15856.0,
+      "completions/mean_length": 6810.234375,
+      "completions/mean_terminated_length": 6810.234375,
+      "completions/min_length": 1105.0,
+      "completions/min_terminated_length": 1105.0,
+      "entropy": 0.7868659943342209,
+      "epoch": 0.609015639374425,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.005002971272915602,
+      "learning_rate": 1e-05,
+      "loss": 0.0826,
+      "num_tokens": 584044250.0,
+      "reward": 0.5390625,
+      "reward_std": 0.22225630283355713,
+      "rewards/accuracy_reward/mean": 0.5390625,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999277591705322,
+      "sampling/importance_sampling_ratio/min": 2.1590203687082976e-05,
+      "sampling/sampling_logp_difference/max": 10.743270874023438,
+      "sampling/sampling_logp_difference/mean": 0.018436448648571968,
+      "step": 662
+    },
+    {
+      "clip_ratio/high_max": 3.5268151805212256e-05,
+      "clip_ratio/high_mean": 9.566726021148497e-06,
+      "clip_ratio/low_mean": 5.7681085309013724e-05,
+      "clip_ratio/low_min": 4.5418209992931224e-06,
+      "clip_ratio/region_mean": 6.724781314915163e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16291.0,
+      "completions/mean_length": 7106.296875,
+      "completions/mean_terminated_length": 6487.78369140625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.8079892098903656,
+      "epoch": 0.609935602575897,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0021831525955349207,
+      "learning_rate": 1e-05,
+      "loss": 0.1195,
+      "num_tokens": 584971568.0,
+      "reward": 0.5625,
+      "reward_std": 0.32772916555404663,
+      "rewards/accuracy_reward/mean": 0.5625,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 8.157488628057763e-05,
+      "sampling/sampling_logp_difference/max": 9.413989067077637,
+      "sampling/sampling_logp_difference/mean": 0.018681492656469345,
+      "step": 663
+    },
+    {
+      "clip_ratio/high_max": 4.332071557655581e-05,
+      "clip_ratio/high_mean": 1.1574332802410936e-05,
+      "clip_ratio/low_mean": 3.626145735324826e-05,
+      "clip_ratio/low_min": 3.933786501875147e-06,
+      "clip_ratio/region_mean": 4.783579004197236e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16363.0,
+      "completions/mean_length": 7235.046875,
+      "completions/mean_terminated_length": 7089.82568359375,
+      "completions/min_length": 1472.0,
+      "completions/min_terminated_length": 1472.0,
+      "entropy": 0.8041050210595131,
+      "epoch": 0.6108555657773689,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004661369137465954,
+      "learning_rate": 1e-05,
+      "loss": 0.0642,
+      "num_tokens": 585916134.0,
+      "reward": 0.4375,
+      "reward_std": 0.322716623544693,
+      "rewards/accuracy_reward/mean": 0.4375,
+      "rewards/accuracy_reward/std": 0.49802759289741516,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000330209732056,
+      "sampling/importance_sampling_ratio/min": 0.0007107750861905515,
+      "sampling/sampling_logp_difference/max": 7.249154567718506,
+      "sampling/sampling_logp_difference/mean": 0.018921509385108948,
+      "step": 664
+    },
+    {
+      "clip_ratio/high_max": 1.4951354842196452e-05,
+      "clip_ratio/high_mean": 3.737838710549113e-06,
+      "clip_ratio/low_mean": 2.6745638365355262e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.0483477416964888e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13888.0,
+      "completions/mean_length": 7077.5859375,
+      "completions/mean_terminated_length": 6777.37890625,
+      "completions/min_length": 944.0,
+      "completions/min_terminated_length": 944.0,
+      "entropy": 0.8417644873261452,
+      "epoch": 0.6117755289788408,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0024479639250785112,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "num_tokens": 586841633.0,
+      "reward": 0.5,
+      "reward_std": 0.2001592218875885,
+      "rewards/accuracy_reward/mean": 0.5,
+      "rewards/accuracy_reward/std": 0.5019646286964417,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998941421508789,
+      "sampling/importance_sampling_ratio/min": 0.00028577001648955047,
+      "sampling/sampling_logp_difference/max": 8.160323143005371,
+      "sampling/sampling_logp_difference/mean": 0.019227145239710808,
+      "step": 665
+    },
+    {
+      "clip_ratio/high_max": 1.7368187855026918e-05,
+      "clip_ratio/high_mean": 5.19675950272358e-06,
+      "clip_ratio/low_mean": 4.123253006582672e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.642928979592398e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15900.0,
+      "completions/mean_length": 8090.3203125,
+      "completions/mean_terminated_length": 7463.0673828125,
+      "completions/min_length": 768.0,
+      "completions/min_terminated_length": 768.0,
+      "entropy": 0.7603196427226067,
+      "epoch": 0.6126954921803128,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005297356750816107,
+      "learning_rate": 1e-05,
+      "loss": 0.0352,
+      "num_tokens": 587897122.0,
+      "reward": 0.2421875,
+      "reward_std": 0.27851754426956177,
+      "rewards/accuracy_reward/mean": 0.2421875,
+      "rewards/accuracy_reward/std": 0.4300905168056488,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999694228172302,
+      "sampling/importance_sampling_ratio/min": 0.0006402728031389415,
+      "sampling/sampling_logp_difference/max": 7.353616237640381,
+      "sampling/sampling_logp_difference/mean": 0.018079372122883797,
+      "step": 666
+    },
+    {
+      "clip_ratio/high_max": 1.5767155673529487e-05,
+      "clip_ratio/high_mean": 3.941788918382372e-06,
+      "clip_ratio/low_mean": 2.9263440183058265e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.3205229101440636e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15514.0,
+      "completions/mean_length": 6908.96875,
+      "completions/mean_terminated_length": 6360.826171875,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.7355617135763168,
+      "epoch": 0.6136154553817847,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003049109596759081,
+      "learning_rate": 1e-05,
+      "loss": 0.0695,
+      "num_tokens": 588801206.0,
+      "reward": 0.515625,
+      "reward_std": 0.25460314750671387,
+      "rewards/accuracy_reward/mean": 0.515625,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999892711639404,
+      "sampling/importance_sampling_ratio/min": 0.0037962812930345535,
+      "sampling/sampling_logp_difference/max": 5.573733329772949,
+      "sampling/sampling_logp_difference/mean": 0.018563130870461464,
+      "step": 667
+    },
+    {
+      "clip_ratio/high_max": 1.725199626889662e-05,
+      "clip_ratio/high_mean": 4.312999067224155e-06,
+      "clip_ratio/low_mean": 6.839358093202463e-05,
+      "clip_ratio/low_min": 9.10438984647044e-06,
+      "clip_ratio/region_mean": 7.27065794308146e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16333.0,
+      "completions/mean_length": 7110.109375,
+      "completions/mean_terminated_length": 6810.951171875,
+      "completions/min_length": 1008.0,
+      "completions/min_terminated_length": 1008.0,
+      "entropy": 0.688617967069149,
+      "epoch": 0.6145354185832567,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0034495368599891663,
+      "learning_rate": 1e-05,
+      "loss": 0.1521,
+      "num_tokens": 589732588.0,
+      "reward": 0.4296875,
+      "reward_std": 0.326668381690979,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999086856842041,
+      "sampling/importance_sampling_ratio/min": 0.000573390512727201,
+      "sampling/sampling_logp_difference/max": 7.4639434814453125,
+      "sampling/sampling_logp_difference/mean": 0.016679491847753525,
+      "step": 668
+    },
+    {
+      "clip_ratio/high_max": 5.049688752478687e-06,
+      "clip_ratio/high_mean": 2.31802277994575e-06,
+      "clip_ratio/low_mean": 5.138145911587344e-05,
+      "clip_ratio/low_min": 3.9801311686460394e-06,
+      "clip_ratio/region_mean": 5.369948189581919e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16263.0,
+      "completions/mean_length": 7533.578125,
+      "completions/mean_terminated_length": 7021.56982421875,
+      "completions/min_length": 1321.0,
+      "completions/min_terminated_length": 1321.0,
+      "entropy": 0.7306379675865173,
+      "epoch": 0.6154553817847286,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004971730522811413,
+      "learning_rate": 1e-05,
+      "loss": 0.0258,
+      "num_tokens": 590717118.0,
+      "reward": 0.390625,
+      "reward_std": 0.30904704332351685,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998904466629028,
+      "sampling/importance_sampling_ratio/min": 4.6860604925313964e-05,
+      "sampling/sampling_logp_difference/max": 9.96833324432373,
+      "sampling/sampling_logp_difference/mean": 0.01741175726056099,
+      "step": 669
+    },
+    {
+      "clip_ratio/high_max": 1.3844989325662027e-05,
+      "clip_ratio/high_mean": 3.4612473314155068e-06,
+      "clip_ratio/low_mean": 4.160707453593204e-05,
+      "clip_ratio/low_min": 7.402582014037762e-06,
+      "clip_ratio/region_mean": 4.506832192419097e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15983.0,
+      "completions/mean_length": 6165.0,
+      "completions/mean_terminated_length": 6002.7939453125,
+      "completions/min_length": 1088.0,
+      "completions/min_terminated_length": 1088.0,
+      "entropy": 0.7227498516440392,
+      "epoch": 0.6163753449862005,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003072877414524555,
+      "learning_rate": 1e-05,
+      "loss": 0.0893,
+      "num_tokens": 591524494.0,
+      "reward": 0.5703125,
+      "reward_std": 0.28353992104530334,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000028610229492,
+      "sampling/importance_sampling_ratio/min": 0.00019289882038719952,
+      "sampling/sampling_logp_difference/max": 8.5533447265625,
+      "sampling/sampling_logp_difference/mean": 0.016893092542886734,
+      "step": 670
+    },
+    {
+      "clip_ratio/high_max": 3.056439982174197e-05,
+      "clip_ratio/high_mean": 8.71779502631398e-06,
+      "clip_ratio/low_mean": 3.8767432329223084e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.7485227241850225e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15451.0,
+      "completions/mean_length": 6061.9375,
+      "completions/mean_terminated_length": 5728.9677734375,
+      "completions/min_length": 973.0,
+      "completions/min_terminated_length": 973.0,
+      "entropy": 0.813653938472271,
+      "epoch": 0.6172953081876725,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003992745652794838,
+      "learning_rate": 1e-05,
+      "loss": 0.0619,
+      "num_tokens": 592320726.0,
+      "reward": 0.578125,
+      "reward_std": 0.22119548916816711,
+      "rewards/accuracy_reward/mean": 0.578125,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999755620956421,
+      "sampling/importance_sampling_ratio/min": 7.489924610126764e-05,
+      "sampling/sampling_logp_difference/max": 9.499366760253906,
+      "sampling/sampling_logp_difference/mean": 0.018718186765909195,
+      "step": 671
+    },
+    {
+      "clip_ratio/high_max": 1.655339747230755e-05,
+      "clip_ratio/high_mean": 4.138349368076888e-06,
+      "clip_ratio/low_mean": 3.851054543702048e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.264889435035002e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16210.0,
+      "completions/mean_length": 7265.9453125,
+      "completions/mean_terminated_length": 6658.0751953125,
+      "completions/min_length": 913.0,
+      "completions/min_terminated_length": 913.0,
+      "entropy": 0.7658502459526062,
+      "epoch": 0.6182152713891444,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.003727070288732648,
+      "learning_rate": 1e-05,
+      "loss": 0.1016,
+      "num_tokens": 593270695.0,
+      "reward": 0.4921875,
+      "reward_std": 0.30327796936035156,
+      "rewards/accuracy_reward/mean": 0.4921875,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999910831451416,
+      "sampling/importance_sampling_ratio/min": 2.014157189478283e-06,
+      "sampling/sampling_logp_difference/max": 13.115309715270996,
+      "sampling/sampling_logp_difference/mean": 0.017805757001042366,
+      "step": 672
+    },
+    {
+      "clip_ratio/high_max": 2.0501698145380942e-05,
+      "clip_ratio/high_mean": 6.335726652650919e-06,
+      "clip_ratio/low_mean": 5.263989112336276e-05,
+      "clip_ratio/low_min": 1.2888257515442092e-05,
+      "clip_ratio/region_mean": 5.897561732126633e-05,
+      "completions/clipped_ratio": 0.1015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16355.0,
+      "completions/mean_length": 8564.046875,
+      "completions/mean_terminated_length": 7680.0517578125,
+      "completions/min_length": 968.0,
+      "completions/min_terminated_length": 968.0,
+      "entropy": 0.6856872886419296,
+      "epoch": 0.6191352345906164,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0038730741944164038,
+      "learning_rate": 1e-05,
+      "loss": 0.0535,
+      "num_tokens": 594386261.0,
+      "reward": 0.4609375,
+      "reward_std": 0.32483339309692383,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999313354492188,
+      "sampling/importance_sampling_ratio/min": 0.00017333027790300548,
+      "sampling/sampling_logp_difference/max": 8.660311698913574,
+      "sampling/sampling_logp_difference/mean": 0.01785116083920002,
+      "step": 673
+    },
+    {
+      "clip_ratio/high_max": 2.6982705094269477e-05,
+      "clip_ratio/high_mean": 8.523603241883393e-06,
+      "clip_ratio/low_mean": 4.970566510564822e-05,
+      "clip_ratio/low_min": 4.473552507988643e-06,
+      "clip_ratio/region_mean": 5.82292680064711e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16349.0,
+      "completions/mean_length": 7838.28125,
+      "completions/mean_terminated_length": 7343.900390625,
+      "completions/min_length": 872.0,
+      "completions/min_terminated_length": 872.0,
+      "entropy": 0.636501632630825,
+      "epoch": 0.6200551977920883,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.004014961421489716,
+      "learning_rate": 1e-05,
+      "loss": 0.0565,
+      "num_tokens": 595407313.0,
+      "reward": 0.46875,
+      "reward_std": 0.3148210048675537,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999966025352478,
+      "sampling/importance_sampling_ratio/min": 9.145037438429426e-07,
+      "sampling/sampling_logp_difference/max": 13.904884338378906,
+      "sampling/sampling_logp_difference/mean": 0.01619477942585945,
+      "step": 674
+    },
+    {
+      "clip_ratio/high_max": 5.649462309520459e-06,
+      "clip_ratio/high_mean": 1.4123655773801147e-06,
+      "clip_ratio/low_mean": 2.8467071842896985e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.98794374202771e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16323.0,
+      "completions/mean_length": 6784.5390625,
+      "completions/mean_terminated_length": 6229.1982421875,
+      "completions/min_length": 906.0,
+      "completions/min_terminated_length": 906.0,
+      "entropy": 0.6435417085886002,
+      "epoch": 0.6209751609935602,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004226911347359419,
+      "learning_rate": 1e-05,
+      "loss": 0.0855,
+      "num_tokens": 596291470.0,
+      "reward": 0.5078125,
+      "reward_std": 0.2409384697675705,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999697208404541,
+      "sampling/importance_sampling_ratio/min": 0.00020356501045171171,
+      "sampling/sampling_logp_difference/max": 8.49952507019043,
+      "sampling/sampling_logp_difference/mean": 0.015974994748830795,
+      "step": 675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 2.2315146964047017e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.2315146964047017e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16122.0,
+      "completions/mean_length": 7650.2265625,
+      "completions/mean_terminated_length": 6989.689453125,
+      "completions/min_length": 1063.0,
+      "completions/min_terminated_length": 1063.0,
+      "entropy": 0.7500722259283066,
+      "epoch": 0.6218951241950322,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0031262668780982494,
+      "learning_rate": 1e-05,
+      "loss": 0.0675,
+      "num_tokens": 597291107.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2012200653553009,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998708963394165,
+      "sampling/importance_sampling_ratio/min": 3.9317012578976573e-07,
+      "sampling/sampling_logp_difference/max": 14.7490234375,
+      "sampling/sampling_logp_difference/mean": 0.01801086962223053,
+      "step": 676
+    },
+    {
+      "clip_ratio/high_max": 2.2775957177145756e-05,
+      "clip_ratio/high_mean": 5.693989294286439e-06,
+      "clip_ratio/low_mean": 5.510050823431811e-05,
+      "clip_ratio/low_min": 4.993807579012355e-06,
+      "clip_ratio/region_mean": 6.079449713070062e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15718.0,
+      "completions/mean_length": 6377.4140625,
+      "completions/mean_terminated_length": 6298.6220703125,
+      "completions/min_length": 478.0,
+      "completions/min_terminated_length": 478.0,
+      "entropy": 0.8221950903534889,
+      "epoch": 0.6228150873965042,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.006345350295305252,
+      "learning_rate": 1e-05,
+      "loss": 0.0759,
+      "num_tokens": 598129568.0,
+      "reward": 0.46875,
+      "reward_std": 0.31929677724838257,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000131130218506,
+      "sampling/importance_sampling_ratio/min": 6.634136661887169e-05,
+      "sampling/sampling_logp_difference/max": 9.620697021484375,
+      "sampling/sampling_logp_difference/mean": 0.01888679713010788,
+      "step": 677
+    },
+    {
+      "clip_ratio/high_max": 2.3920926196296932e-05,
+      "clip_ratio/high_mean": 7.139227250263502e-06,
+      "clip_ratio/low_mean": 5.5144641464721644e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.228386882867198e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14250.0,
+      "completions/mean_length": 5567.2578125,
+      "completions/mean_terminated_length": 5218.33056640625,
+      "completions/min_length": 927.0,
+      "completions/min_terminated_length": 927.0,
+      "entropy": 0.7284790053963661,
+      "epoch": 0.6237350505979761,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.003562809666618705,
+      "learning_rate": 1e-05,
+      "loss": 0.0811,
+      "num_tokens": 598862361.0,
+      "reward": 0.5703125,
+      "reward_std": 0.2698703408241272,
+      "rewards/accuracy_reward/mean": 0.5703125,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999499320983887,
+      "sampling/importance_sampling_ratio/min": 2.3016077932425105e-07,
+      "sampling/sampling_logp_difference/max": 15.2844877243042,
+      "sampling/sampling_logp_difference/mean": 0.016367387026548386,
+      "step": 678
+    },
+    {
+      "clip_ratio/high_max": 1.4490571629721671e-05,
+      "clip_ratio/high_mean": 4.364888013697055e-06,
+      "clip_ratio/low_mean": 2.498499657122011e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.934988481229084e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15202.0,
+      "completions/mean_length": 8128.375,
+      "completions/mean_terminated_length": 7578.00048828125,
+      "completions/min_length": 1066.0,
+      "completions/min_terminated_length": 1066.0,
+      "entropy": 0.7838430106639862,
+      "epoch": 0.624655013799448,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0031477995216846466,
+      "learning_rate": 1e-05,
+      "loss": 0.0517,
+      "num_tokens": 599921233.0,
+      "reward": 0.34375,
+      "reward_std": 0.21542152762413025,
+      "rewards/accuracy_reward/mean": 0.34375,
+      "rewards/accuracy_reward/std": 0.47682511806488037,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999460577964783,
+      "sampling/importance_sampling_ratio/min": 0.00018987487419508398,
+      "sampling/sampling_logp_difference/max": 8.569145202636719,
+      "sampling/sampling_logp_difference/mean": 0.019213391467928886,
+      "step": 679
+    },
+    {
+      "clip_ratio/high_max": 2.650051692398847e-05,
+      "clip_ratio/high_mean": 8.023214263630507e-06,
+      "clip_ratio/low_mean": 3.322141196804296e-05,
+      "clip_ratio/low_min": 2.5509161787340418e-06,
+      "clip_ratio/region_mean": 4.124462532217876e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15879.0,
+      "completions/mean_length": 7452.296875,
+      "completions/mean_terminated_length": 7013.0322265625,
+      "completions/min_length": 799.0,
+      "completions/min_terminated_length": 799.0,
+      "entropy": 0.8657966181635857,
+      "epoch": 0.62557497700092,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0034168637357652187,
+      "learning_rate": 1e-05,
+      "loss": 0.0896,
+      "num_tokens": 600895023.0,
+      "reward": 0.296875,
+      "reward_std": 0.3061561584472656,
+      "rewards/accuracy_reward/mean": 0.296875,
+      "rewards/accuracy_reward/std": 0.45867621898651123,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999901056289673,
+      "sampling/importance_sampling_ratio/min": 0.0003922602627426386,
+      "sampling/sampling_logp_difference/max": 7.843585014343262,
+      "sampling/sampling_logp_difference/mean": 0.019955754280090332,
+      "step": 680
+    },
+    {
+      "clip_ratio/high_max": 8.234628239733865e-06,
+      "clip_ratio/high_mean": 2.0586570599334664e-06,
+      "clip_ratio/low_mean": 5.516502255886735e-05,
+      "clip_ratio/low_min": 5.772084023192292e-06,
+      "clip_ratio/region_mean": 5.7223681096729706e-05,
+      "completions/clipped_ratio": 0.0546875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15759.0,
+      "completions/mean_length": 7581.625,
+      "completions/mean_terminated_length": 7072.396484375,
+      "completions/min_length": 1686.0,
+      "completions/min_terminated_length": 1686.0,
+      "entropy": 0.764233261346817,
+      "epoch": 0.6264949402023919,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0026859277859330177,
+      "learning_rate": 1e-05,
+      "loss": 0.105,
+      "num_tokens": 601887935.0,
+      "reward": 0.421875,
+      "reward_std": 0.3295465111732483,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999804496765137,
+      "sampling/importance_sampling_ratio/min": 0.029503032565116882,
+      "sampling/sampling_logp_difference/max": 3.5232622623443604,
+      "sampling/sampling_logp_difference/mean": 0.018653862178325653,
+      "step": 681
+    },
+    {
+      "clip_ratio/high_max": 2.654059608175885e-05,
+      "clip_ratio/high_mean": 6.635149020439712e-06,
+      "clip_ratio/low_mean": 5.129833289174712e-05,
+      "clip_ratio/low_min": 5.234505806583911e-06,
+      "clip_ratio/region_mean": 5.793348100269213e-05,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16279.0,
+      "completions/mean_length": 8824.2421875,
+      "completions/mean_terminated_length": 8452.4501953125,
+      "completions/min_length": 1991.0,
+      "completions/min_terminated_length": 1991.0,
+      "entropy": 0.7557987719774246,
+      "epoch": 0.6274149034038639,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.002624326851218939,
+      "learning_rate": 1e-05,
+      "loss": 0.0491,
+      "num_tokens": 603035462.0,
+      "reward": 0.328125,
+      "reward_std": 0.2688094973564148,
+      "rewards/accuracy_reward/mean": 0.328125,
+      "rewards/accuracy_reward/std": 0.4713755249977112,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999333024024963,
+      "sampling/importance_sampling_ratio/min": 5.1453887863317505e-05,
+      "sampling/sampling_logp_difference/max": 9.874824523925781,
+      "sampling/sampling_logp_difference/mean": 0.01799936406314373,
+      "step": 682
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 3.1395032920045196e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.1395032920045196e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16189.0,
+      "completions/mean_length": 5832.7890625,
+      "completions/mean_terminated_length": 5749.70849609375,
+      "completions/min_length": 948.0,
+      "completions/min_terminated_length": 948.0,
+      "entropy": 0.8034545630216599,
+      "epoch": 0.6283348666053358,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005783884786069393,
+      "learning_rate": 1e-05,
+      "loss": 0.0796,
+      "num_tokens": 603801083.0,
+      "reward": 0.5234375,
+      "reward_std": 0.27092626690864563,
+      "rewards/accuracy_reward/mean": 0.5234375,
+      "rewards/accuracy_reward/std": 0.5014128684997559,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000687837600708,
+      "sampling/importance_sampling_ratio/min": 0.033374395221471786,
+      "sampling/sampling_logp_difference/max": 3.399966239929199,
+      "sampling/sampling_logp_difference/mean": 0.01805710420012474,
+      "step": 683
+    },
+    {
+      "clip_ratio/high_max": 2.2193052700458793e-05,
+      "clip_ratio/high_mean": 6.736250270478195e-06,
+      "clip_ratio/low_mean": 5.521000275621191e-05,
+      "clip_ratio/low_min": 9.064021924132248e-06,
+      "clip_ratio/region_mean": 6.19462530266901e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16221.0,
+      "completions/mean_length": 7247.4609375,
+      "completions/mean_terminated_length": 7102.43701171875,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.908146396279335,
+      "epoch": 0.6292548298068077,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.005038067698478699,
+      "learning_rate": 1e-05,
+      "loss": 0.0832,
+      "num_tokens": 604748150.0,
+      "reward": 0.46875,
+      "reward_std": 0.43106767535209656,
+      "rewards/accuracy_reward/mean": 0.46875,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999319314956665,
+      "sampling/importance_sampling_ratio/min": 0.0030831864569336176,
+      "sampling/sampling_logp_difference/max": 5.781791687011719,
+      "sampling/sampling_logp_difference/mean": 0.01983889564871788,
+      "step": 684
+    },
+    {
+      "clip_ratio/high_max": 8.630155889477464e-06,
+      "clip_ratio/high_mean": 2.157538972369366e-06,
+      "clip_ratio/low_mean": 6.599987852951017e-05,
+      "clip_ratio/low_min": 1.7551100199852954e-05,
+      "clip_ratio/region_mean": 6.815741778609663e-05,
+      "completions/clipped_ratio": 0.0390625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15519.0,
+      "completions/mean_length": 6861.078125,
+      "completions/mean_terminated_length": 6473.96728515625,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "entropy": 0.7612876370549202,
+      "epoch": 0.6301747930082797,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.0053928992711007595,
+      "learning_rate": 1e-05,
+      "loss": 0.0967,
+      "num_tokens": 605642768.0,
+      "reward": 0.5078125,
+      "reward_std": 0.40503159165382385,
+      "rewards/accuracy_reward/mean": 0.5078125,
+      "rewards/accuracy_reward/std": 0.5019033551216125,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999208450317383,
+      "sampling/importance_sampling_ratio/min": 4.585228089126758e-05,
+      "sampling/sampling_logp_difference/max": 9.99008560180664,
+      "sampling/sampling_logp_difference/mean": 0.018197370693087578,
+      "step": 685
+    },
+    {
+      "clip_ratio/high_max": 2.531879181333352e-05,
+      "clip_ratio/high_mean": 6.32969795333338e-06,
+      "clip_ratio/low_mean": 5.132838714416721e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.765808464275324e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16310.0,
+      "completions/mean_length": 6837.8203125,
+      "completions/mean_terminated_length": 6201.40869140625,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.6217481270432472,
+      "epoch": 0.6310947562097516,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0032709913793951273,
+      "learning_rate": 1e-05,
+      "loss": 0.1155,
+      "num_tokens": 606534577.0,
+      "reward": 0.484375,
+      "reward_std": 0.2567248046398163,
+      "rewards/accuracy_reward/mean": 0.484375,
+      "rewards/accuracy_reward/std": 0.5017194747924805,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999561309814453,
+      "sampling/importance_sampling_ratio/min": 4.2650382965803146e-05,
+      "sampling/sampling_logp_difference/max": 10.062474250793457,
+      "sampling/sampling_logp_difference/mean": 0.016331009566783905,
+      "step": 686
+    },
+    {
+      "clip_ratio/high_max": 1.0992388070008019e-05,
+      "clip_ratio/high_mean": 3.581897317417315e-06,
+      "clip_ratio/low_mean": 5.021198876420385e-05,
+      "clip_ratio/low_min": 4.219409220240777e-06,
+      "clip_ratio/region_mean": 5.379388539950014e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15103.0,
+      "completions/mean_length": 6458.703125,
+      "completions/mean_terminated_length": 6380.55126953125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.7460968196392059,
+      "epoch": 0.6320147194112236,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.002640153281390667,
+      "learning_rate": 1e-05,
+      "loss": 0.0581,
+      "num_tokens": 607381811.0,
+      "reward": 0.4140625,
+      "reward_std": 0.2382800281047821,
+      "rewards/accuracy_reward/mean": 0.4140625,
+      "rewards/accuracy_reward/std": 0.49449479579925537,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000228881835938,
+      "sampling/importance_sampling_ratio/min": 8.858721116666857e-07,
+      "sampling/sampling_logp_difference/max": 13.93669319152832,
+      "sampling/sampling_logp_difference/mean": 0.017693117260932922,
+      "step": 687
+    },
+    {
+      "clip_ratio/high_max": 1.2546400967039517e-05,
+      "clip_ratio/high_mean": 3.1366002417598793e-06,
+      "clip_ratio/low_mean": 6.0473582834674744e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.361018404277274e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16121.0,
+      "completions/mean_length": 7043.1640625,
+      "completions/mean_terminated_length": 6894.8974609375,
+      "completions/min_length": 952.0,
+      "completions/min_terminated_length": 952.0,
+      "entropy": 0.7884078621864319,
+      "epoch": 0.6329346826126955,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.003713687416166067,
+      "learning_rate": 1e-05,
+      "loss": 0.0635,
+      "num_tokens": 608302256.0,
+      "reward": 0.390625,
+      "reward_std": 0.2648528814315796,
+      "rewards/accuracy_reward/mean": 0.390625,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999112486839294,
+      "sampling/importance_sampling_ratio/min": 9.931326871992496e-08,
+      "sampling/sampling_logp_difference/max": 16.12498664855957,
+      "sampling/sampling_logp_difference/mean": 0.019254781305789948,
+      "step": 688
+    },
+    {
+      "clip_ratio/high_max": 7.887592573752045e-06,
+      "clip_ratio/high_mean": 1.971898143438011e-06,
+      "clip_ratio/low_mean": 4.4303845015747356e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.6275743216028786e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15487.0,
+      "completions/mean_length": 8012.8359375,
+      "completions/mean_terminated_length": 7742.79833984375,
+      "completions/min_length": 866.0,
+      "completions/min_terminated_length": 866.0,
+      "entropy": 0.8368816301226616,
+      "epoch": 0.6338546458141674,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.004894682671874762,
+      "learning_rate": 1e-05,
+      "loss": 0.0275,
+      "num_tokens": 609348299.0,
+      "reward": 0.4296875,
+      "reward_std": 0.3027411997318268,
+      "rewards/accuracy_reward/mean": 0.4296875,
+      "rewards/accuracy_reward/std": 0.4969765841960907,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000343322753906,
+      "sampling/importance_sampling_ratio/min": 0.0021496599074453115,
+      "sampling/sampling_logp_difference/max": 6.1424455642700195,
+      "sampling/sampling_logp_difference/mean": 0.01958826184272766,
+      "step": 689
+    },
+    {
+      "clip_ratio/high_max": 1.0690811450331239e-05,
+      "clip_ratio/high_mean": 2.6727028625828098e-06,
+      "clip_ratio/low_mean": 3.859445814669016e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.1267160668212455e-05,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16065.0,
+      "completions/mean_length": 7594.3671875,
+      "completions/mean_terminated_length": 7008.39208984375,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.692665733397007,
+      "epoch": 0.6347746090156394,
+      "frac_reward_zero_std": 0.3125,
+      "grad_norm": 0.0039004215504974127,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "num_tokens": 610341090.0,
+      "reward": 0.3984375,
+      "reward_std": 0.3284856975078583,
+      "rewards/accuracy_reward/mean": 0.3984375,
+      "rewards/accuracy_reward/std": 0.4915000796318054,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999041557312012,
+      "sampling/importance_sampling_ratio/min": 4.006533345091157e-05,
+      "sampling/sampling_logp_difference/max": 10.124999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01734849065542221,
+      "step": 690
+    },
+    {
+      "clip_ratio/high_max": 4.406994776218198e-06,
+      "clip_ratio/high_mean": 2.7999831218039617e-06,
+      "clip_ratio/low_mean": 5.9335616697353544e-05,
+      "clip_ratio/low_min": 5.472375505632954e-06,
+      "clip_ratio/region_mean": 6.21355998191575e-05,
+      "completions/clipped_ratio": 0.03125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16292.0,
+      "completions/mean_length": 7640.09375,
+      "completions/mean_terminated_length": 7358.0322265625,
+      "completions/min_length": 826.0,
+      "completions/min_terminated_length": 826.0,
+      "entropy": 0.8469130471348763,
+      "epoch": 0.6356945722171113,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.004913663491606712,
+      "learning_rate": 1e-05,
+      "loss": 0.0782,
+      "num_tokens": 611339726.0,
+      "reward": 0.359375,
+      "reward_std": 0.3356248140335083,
+      "rewards/accuracy_reward/mean": 0.359375,
+      "rewards/accuracy_reward/std": 0.481702595949173,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998912811279297,
+      "sampling/importance_sampling_ratio/min": 4.2071459205317296e-08,
+      "sampling/sampling_logp_difference/max": 16.983896255493164,
+      "sampling/sampling_logp_difference/mean": 0.019604282453656197,
+      "step": 691
+    },
+    {
+      "clip_ratio/high_max": 1.4971937162044924e-05,
+      "clip_ratio/high_mean": 5.209913979342673e-06,
+      "clip_ratio/low_mean": 2.7830240469484124e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.304015490357415e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 15895.0,
+      "completions/max_terminated_length": 15895.0,
+      "completions/mean_length": 5063.6953125,
+      "completions/mean_terminated_length": 5063.6953125,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 0.7586102113127708,
+      "epoch": 0.6366145354185833,
+      "frac_reward_zero_std": 0.5625,
+      "grad_norm": 0.0032354791183024645,
+      "learning_rate": 1e-05,
+      "loss": 0.0371,
+      "num_tokens": 612005495.0,
+      "reward": 0.59375,
+      "reward_std": 0.18990948796272278,
+      "rewards/accuracy_reward/mean": 0.59375,
+      "rewards/accuracy_reward/std": 0.4930621087551117,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999009370803833,
+      "sampling/importance_sampling_ratio/min": 0.02037520334124565,
+      "sampling/sampling_logp_difference/max": 3.8934366703033447,
+      "sampling/sampling_logp_difference/mean": 0.0178166925907135,
+      "step": 692
+    },
+    {
+      "clip_ratio/high_max": 2.1337797079468146e-05,
+      "clip_ratio/high_mean": 5.3344492698670365e-06,
+      "clip_ratio/low_mean": 1.1576638144106255e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.691108741397329e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14868.0,
+      "completions/mean_length": 6542.1640625,
+      "completions/mean_terminated_length": 6385.94482421875,
+      "completions/min_length": 665.0,
+      "completions/min_terminated_length": 665.0,
+      "entropy": 0.847448967397213,
+      "epoch": 0.6375344986200552,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004039868246763945,
+      "learning_rate": 1e-05,
+      "loss": 0.0357,
+      "num_tokens": 612870060.0,
+      "reward": 0.453125,
+      "reward_std": 0.2590789198875427,
+      "rewards/accuracy_reward/mean": 0.453125,
+      "rewards/accuracy_reward/std": 0.4997538626194,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9998836517333984,
+      "sampling/importance_sampling_ratio/min": 2.2897740994953786e-11,
+      "sampling/sampling_logp_difference/max": 24.499982833862305,
+      "sampling/sampling_logp_difference/mean": 0.019780561327934265,
+      "step": 693
+    },
+    {
+      "clip_ratio/high_max": 6.333826149784727e-06,
+      "clip_ratio/high_mean": 1.5834565374461818e-06,
+      "clip_ratio/low_mean": 3.4833526569855167e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.641698299361451e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16287.0,
+      "completions/mean_length": 5805.8203125,
+      "completions/mean_terminated_length": 5551.9443359375,
+      "completions/min_length": 919.0,
+      "completions/min_terminated_length": 919.0,
+      "entropy": 0.6972410827875137,
+      "epoch": 0.6384544618215271,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0023007066920399666,
+      "learning_rate": 1e-05,
+      "loss": 0.0632,
+      "num_tokens": 613633581.0,
+      "reward": 0.609375,
+      "reward_std": 0.23857943713665009,
+      "rewards/accuracy_reward/mean": 0.609375,
+      "rewards/accuracy_reward/std": 0.4898075461387634,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000262260437012,
+      "sampling/importance_sampling_ratio/min": 0.00026135475491173565,
+      "sampling/sampling_logp_difference/max": 8.249631881713867,
+      "sampling/sampling_logp_difference/mean": 0.016993921250104904,
+      "step": 694
+    },
+    {
+      "clip_ratio/high_max": 6.643952701779199e-06,
+      "clip_ratio/high_mean": 1.6609881754447997e-06,
+      "clip_ratio/low_mean": 1.501361566624837e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 1.667460389853659e-05,
+      "completions/clipped_ratio": 0.09375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16249.0,
+      "completions/mean_length": 7504.65625,
+      "completions/mean_terminated_length": 6586.103515625,
+      "completions/min_length": 843.0,
+      "completions/min_terminated_length": 843.0,
+      "entropy": 0.7908455803990364,
+      "epoch": 0.6393744250229991,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0029130352195352316,
+      "learning_rate": 1e-05,
+      "loss": 0.0413,
+      "num_tokens": 614611881.0,
+      "reward": 0.3671875,
+      "reward_std": 0.23250606656074524,
+      "rewards/accuracy_reward/mean": 0.3671875,
+      "rewards/accuracy_reward/std": 0.4839322865009308,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.999863862991333,
+      "sampling/importance_sampling_ratio/min": 2.025089543167269e-06,
+      "sampling/sampling_logp_difference/max": 13.109896659851074,
+      "sampling/sampling_logp_difference/mean": 0.018666472285985947,
+      "step": 695
+    },
+    {
+      "clip_ratio/high_max": 1.817479960664059e-05,
+      "clip_ratio/high_mean": 4.543699901660148e-06,
+      "clip_ratio/low_mean": 5.670640712196473e-05,
+      "clip_ratio/low_min": 6.148246484372066e-06,
+      "clip_ratio/region_mean": 6.125010668256436e-05,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 13984.0,
+      "completions/max_terminated_length": 13984.0,
+      "completions/mean_length": 5627.265625,
+      "completions/mean_terminated_length": 5627.265625,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "entropy": 0.7167766839265823,
+      "epoch": 0.640294388224471,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.0020515238866209984,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "num_tokens": 615355915.0,
+      "reward": 0.421875,
+      "reward_std": 0.26827272772789,
+      "rewards/accuracy_reward/mean": 0.421875,
+      "rewards/accuracy_reward/std": 0.4957992732524872,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999716877937317,
+      "sampling/importance_sampling_ratio/min": 0.002808797173202038,
+      "sampling/sampling_logp_difference/max": 5.874999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01694992370903492,
+      "step": 696
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 5.3280599786376115e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 5.3280599786376115e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14713.0,
+      "completions/mean_length": 6129.9140625,
+      "completions/mean_terminated_length": 5967.1513671875,
+      "completions/min_length": 1201.0,
+      "completions/min_terminated_length": 1201.0,
+      "entropy": 0.7654511705040932,
+      "epoch": 0.641214351425943,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.003425017697736621,
+      "learning_rate": 1e-05,
+      "loss": 0.0428,
+      "num_tokens": 616159416.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2188364714384079,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999443292617798,
+      "sampling/importance_sampling_ratio/min": 0.005587513092905283,
+      "sampling/sampling_logp_difference/max": 5.187221050262451,
+      "sampling/sampling_logp_difference/mean": 0.01828661933541298,
+      "step": 697
+    },
+    {
+      "clip_ratio/high_max": 2.1838685825059656e-05,
+      "clip_ratio/high_mean": 5.459671456264914e-06,
+      "clip_ratio/low_mean": 3.4785461366482195e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 4.024513225431292e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 16340.0,
+      "completions/mean_length": 7219.078125,
+      "completions/mean_terminated_length": 7146.91357421875,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "entropy": 0.847568191587925,
+      "epoch": 0.6421343146274149,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.005707201547920704,
+      "learning_rate": 1e-05,
+      "loss": 0.0607,
+      "num_tokens": 617101738.0,
+      "reward": 0.53125,
+      "reward_std": 0.2835350036621094,
+      "rewards/accuracy_reward/mean": 0.53125,
+      "rewards/accuracy_reward/std": 0.5009832978248596,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999914765357971,
+      "sampling/importance_sampling_ratio/min": 2.5612937406549463e-06,
+      "sampling/sampling_logp_difference/max": 12.874998092651367,
+      "sampling/sampling_logp_difference/mean": 0.01983051374554634,
+      "step": 698
+    },
+    {
+      "clip_ratio/high_max": 2.676450185390422e-05,
+      "clip_ratio/high_mean": 8.55213056638604e-06,
+      "clip_ratio/low_mean": 5.492671812135086e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 6.347884914248425e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14972.0,
+      "completions/mean_length": 6116.96875,
+      "completions/mean_terminated_length": 5870.56005859375,
+      "completions/min_length": 1371.0,
+      "completions/min_terminated_length": 1371.0,
+      "entropy": 0.7148991823196411,
+      "epoch": 0.6430542778288868,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.004018646199256182,
+      "learning_rate": 1e-05,
+      "loss": 0.078,
+      "num_tokens": 617903030.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2569621503353119,
+      "rewards/accuracy_reward/mean": 0.5546875,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999732971191406,
+      "sampling/importance_sampling_ratio/min": 0.00015846146561671048,
+      "sampling/sampling_logp_difference/max": 8.749999046325684,
+      "sampling/sampling_logp_difference/mean": 0.017638593912124634,
+      "step": 699
+    },
+    {
+      "clip_ratio/high_max": 3.844970706268214e-06,
+      "clip_ratio/high_mean": 1.9004990008397726e-06,
+      "clip_ratio/low_mean": 7.103690825260855e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 7.29374083903167e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15232.0,
+      "completions/mean_length": 7486.515625,
+      "completions/mean_terminated_length": 7272.9765625,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.7912377193570137,
+      "epoch": 0.6439742410303588,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00282766274176538,
+      "learning_rate": 1e-05,
+      "loss": 0.0617,
+      "num_tokens": 618880312.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32089442014694214,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999787211418152,
+      "sampling/importance_sampling_ratio/min": 0.0001030677231028676,
+      "sampling/sampling_logp_difference/max": 9.180124282836914,
+      "sampling/sampling_logp_difference/mean": 0.01940794661641121,
+      "step": 700
+    },
+    {
+      "clip_ratio/high_max": 2.241842275907402e-05,
+      "clip_ratio/high_mean": 6.616161613237637e-06,
+      "clip_ratio/low_mean": 3.103233757428825e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.76484995285864e-05,
+      "completions/clipped_ratio": 0.0703125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 13644.0,
+      "completions/mean_length": 7297.453125,
+      "completions/mean_terminated_length": 6610.23583984375,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.8420139253139496,
+      "epoch": 0.6448942042318307,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.0016839519375935197,
+      "learning_rate": 1e-05,
+      "loss": 0.0438,
+      "num_tokens": 619834002.0,
+      "reward": 0.3359375,
+      "reward_std": 0.2801200747489929,
+      "rewards/accuracy_reward/mean": 0.3359375,
+      "rewards/accuracy_reward/std": 0.47417303919792175,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999658465385437,
+      "sampling/importance_sampling_ratio/min": 0.0005040382966399193,
+      "sampling/sampling_logp_difference/max": 7.59285831451416,
+      "sampling/sampling_logp_difference/mean": 0.019356656819581985,
+      "step": 701
+    },
+    {
+      "clip_ratio/high_max": 9.791850970941596e-06,
+      "clip_ratio/high_mean": 2.447962742735399e-06,
+      "clip_ratio/low_mean": 4.7923438614816405e-05,
+      "clip_ratio/low_min": 3.219243353669299e-06,
+      "clip_ratio/region_mean": 5.0371401357551804e-05,
+      "completions/clipped_ratio": 0.0234375,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15471.0,
+      "completions/mean_length": 5935.53125,
+      "completions/mean_terminated_length": 5684.76806640625,
+      "completions/min_length": 632.0,
+      "completions/min_terminated_length": 632.0,
+      "entropy": 0.6855737417936325,
+      "epoch": 0.6458141674333027,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.00550073804333806,
+      "learning_rate": 1e-05,
+      "loss": 0.0822,
+      "num_tokens": 620615054.0,
+      "reward": 0.4609375,
+      "reward_std": 0.3366856575012207,
+      "rewards/accuracy_reward/mean": 0.4609375,
+      "rewards/accuracy_reward/std": 0.5004304051399231,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 1.0000221729278564,
+      "sampling/importance_sampling_ratio/min": 2.4300854420289397e-05,
+      "sampling/sampling_logp_difference/max": 10.624999046325684,
+      "sampling/sampling_logp_difference/mean": 0.01712688058614731,
+      "step": 702
+    },
+    {
+      "clip_ratio/high_max": 1.3569449947681278e-05,
+      "clip_ratio/high_mean": 3.3923624869203195e-06,
+      "clip_ratio/low_mean": 2.6169475859205704e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 2.95618385734997e-05,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 14395.0,
+      "completions/mean_length": 6016.0625,
+      "completions/mean_terminated_length": 5851.4921875,
+      "completions/min_length": 986.0,
+      "completions/min_terminated_length": 986.0,
+      "entropy": 0.7685846760869026,
+      "epoch": 0.6467341306347746,
+      "frac_reward_zero_std": 0.4375,
+      "grad_norm": 0.005174044985324144,
+      "learning_rate": 1e-05,
+      "loss": 0.0922,
+      "num_tokens": 621407854.0,
+      "reward": 0.4453125,
+      "reward_std": 0.25330984592437744,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999493956565857,
+      "sampling/importance_sampling_ratio/min": 3.535773794283159e-05,
+      "sampling/sampling_logp_difference/max": 10.249993324279785,
+      "sampling/sampling_logp_difference/mean": 0.017704609781503677,
+      "step": 703
+    },
+    {
+      "clip_ratio/high_max": 8.932004220696399e-06,
+      "clip_ratio/high_mean": 2.2330010551740997e-06,
+      "clip_ratio/low_mean": 3.712984198500635e-05,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 3.936284304018045e-05,
+      "completions/clipped_ratio": 0.0078125,
+      "completions/max_length": 16384.0,
+      "completions/max_terminated_length": 15771.0,
+      "completions/mean_length": 6402.3984375,
+      "completions/mean_terminated_length": 6323.80322265625,
+      "completions/min_length": 443.0,
+      "completions/min_terminated_length": 443.0,
+      "entropy": 0.8285454586148262,
+      "epoch": 0.6476540938362465,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.0022989478893578053,
+      "learning_rate": 1e-05,
+      "loss": 0.1083,
+      "num_tokens": 622246633.0,
+      "reward": 0.4453125,
+      "reward_std": 0.32089439034461975,
+      "rewards/accuracy_reward/mean": 0.4453125,
+      "rewards/accuracy_reward/std": 0.4989531338214874,
+      "sampling/importance_sampling_ratio/max": 2.0,
+      "sampling/importance_sampling_ratio/mean": 0.9999479055404663,
+      "sampling/importance_sampling_ratio/min": 4.360687739790592e-07,
+      "sampling/sampling_logp_difference/max": 14.645465850830078,
+      "sampling/sampling_logp_difference/mean": 0.01977568492293358,
+      "step": 704
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 622246633,
+  "num_train_epochs": 1,
+  "save_steps": 64,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-704/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/checkpoint-704/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Subset of one ``*_model_states.pt`` file kept by ``parse_model_states``."""
+    # buffer name -> tensor, already converted with ``.float()``
+    buffers: dict
+    # sequence of ``{param name: shape}`` mappings (iterated as a list of dicts by the merge helpers)
+    param_shapes: list
+    # ``[target name, source name]`` pairs; the target entry is copied from the source entry
+    shared_params: list
+    # value stored under ``DS_VERSION``; None when the checkpoint has no such key
+    ds_version: int
+    # ``{param name: shape}`` for frozen parameters; None when the checkpoint has none
+    frozen_param_shapes: dict
+    # ``{param name: tensor}`` for frozen parameters; None when the checkpoint has none
+    frozen_param_fragments: dict
+
+
+# set to a truthy value to make the helpers below print verbose diagnostics
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    """Return ``int(text)`` when ``text.isdigit()`` holds, otherwise return ``text`` unchanged."""
+    if not text.isdigit():
+        return text
+    return int(text)
+
+
+def natural_keys(text):
+    '''
+    Sort key for human ordering, used as ``alist.sort(key=natural_keys)``.
+
+    The text is split on runs of digits; every piece is passed through ``atoi`` so digit runs
+    compare as integers.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+
+    Args:
+        - ``checkpoint_dir``: directory that holds the checkpoint files
+        - ``zero_stage``: ZeRO stage of the checkpoint; stages ``<= 2`` and stage ``3`` use different file names
+
+    Returns:
+        - path of the model states file
+
+    Raises:
+        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
+        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch ``file`` stayed unbound and the check below died with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.
+
+    Raises:
+        - ``FileNotFoundError``: nothing matches the pattern
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    ckpt_files = sorted(matches, key=natural_keys)
+
+    if not ckpt_files:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+
+
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+
+
+def parse_model_states(files):
+    """
+    Load every model states file and keep only the pieces needed to rebuild the fp32 weights.
+
+    Args:
+        - ``files``: paths of the model states files to load
+
+    Returns:
+        - list with one ``zero_model_state`` per entry of ``files``, in the same order
+
+    Raises:
+        - ``ValueError``: a loaded file has no ``BUFFER_NAMES`` entry
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: weights_only=False unpickles arbitrary objects - only run this on checkpoints you trust
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # frozen parameters are optional, both entries are None when the checkpoint has none
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state shards and pull out the flat fp32 weight partitions.
+
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` shards
+        - ``ds_checkpoint_dir``: checkpoint folder, only used in the error message
+
+    Returns:
+        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per shard
+
+    Raises:
+        - ``ValueError``: the first shard has no ``ZERO_STAGE`` entry, the number of shards differs from the
+          recorded partition count, or the stage is neither ``<= 2`` nor ``3``
+    """
+    total_files = len(files)
+    state_dicts = []
+    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(shard)
+
+    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim_state[ZERO_STAGE]
+    world_size = first_optim_state[PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged into the result
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # pick the stage specific reconstruction routine
+    if zero_stage <= 2:
+        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+
+    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy rank 0's frozen parameter fragments into ``state_dict`` (updated in place).
+
+    Does nothing when rank 0 recorded no frozen parameter shapes.
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_numel = 0
+    total_params = 0
+    for name, shape in frozen_param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_params += 1
+        total_numel += unpartitioned_numel
+
+        # the fragment stored by rank 0 is used as is
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.
+
+    Args:
+        - ``state_dict``: mapping updated in place with ``name -> tensor`` entries
+        - ``world_size``: dp world size; used by the debug loop and as the alignment unit of the sanity check
+        - ``fp32_flat_groups``: one entry per rank, each a sequence of flat partitions indexed by param group
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first entry is read
+
+    Raises:
+        - ``ValueError``: for a param group, the aligned number of consumed elements differs from the aligned
+          number of available elements
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    # for every param group, concatenate the partitions of all ranks into one flat vector
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # param_shapes and the merged vectors are walked in lockstep, one pair per param group
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        # from here on avail_numel is per group, not the overall total computed above
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape may be an object with a numel() method or a plain sequence of ints
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # slice this param out of the flat vector and give it its shape back (a view, no copy)
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a ZeRO stage <= 2 checkpoint: buffers, then frozen parameters
+    (unless ``exclude_frozen_parameters``), then trainable parameters, then shared parameter entries.
+    """
+    state_dict = OrderedDict()
+    rank0_state = zero_model_states[0]
+
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in rank0_state.shared_params:
+        target_name, source_name = pair[0], pair[1]
+        if source_name in state_dict:
+            state_dict[target_name] = state_dict[source_name]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a parameter split over ``world_size`` ranks.
+
+    ``partitioned_numel`` is ``unpartitioned_numel / world_size`` rounded up; ``padding_numel`` is the
+    number of elements missing to make ``unpartitioned_numel`` a multiple of ``world_size``.
+    """
+    remainder = unpartitioned_numel % world_size
+    if remainder:
+        padding_numel = world_size - remainder
+    else:
+        padding_numel = 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint and store them in ``state_dict``.
+
+    For each name in rank 0's ``frozen_param_shapes`` the fragments of all model states are
+    concatenated, cut to the unpartitioned element count and viewed with the recorded shape.
+    Does nothing when rank 0 recorded no frozen parameter shapes.
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        frozen_param_shapes = rank0_state.frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum(p.numel() for p in rank0_state.frozen_param_fragments.values()) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_numel = 0
+    total_params = 0
+    for name, shape in rank0_state.frozen_param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_params += 1
+        total_numel += unpartitioned_numel
+
+        # stitch the fragments of all ranks together, then drop whatever follows the real elements
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        merged = torch.cat(param_frags, 0)
+        state_dict[name] = merged.narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    Nothing is copied until ``contiguous()`` is called; the instance only remembers where the
+    parameter's slice lives inside each rank's flat groups.
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        """
+        Args:
+            - ``flat_groups``: one entry per rank, each a sequence of flat tensors indexed by group
+            - ``flat_groups_offset``: cumulative group start offsets, with one more entry than there are groups
+            - ``offset``: start of this parameter's slice, counted across the concatenated groups of one rank
+            - ``partitioned_numel``: number of elements each rank holds for this parameter
+            - ``shape``: full shape of the parameter; must provide ``numel()``
+        """
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns a contiguous tensor of ``self.shape`` built from the first ``self.shape.numel()``
+        elements of the concatenated per-rank slices.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # the lookup below only reads offsets, so it finds the same groups for every rank
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            # NOTE(review): start_offset is relative to each visited group, so it would turn negative
+            # for any group after the first one - confirm a parameter never spans several groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # cut off the trailing elements beyond the real parameter size before reshaping
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register the trainable parameters of a ZeRO stage 3 checkpoint in ``state_dict``.
+
+    Each parameter is stored as a lazy ``GatheredTensor``; no weights are copied here.
+
+    Args:
+        - ``state_dict``: mapping updated in place with ``name -> GatheredTensor`` entries
+        - ``world_size``: number of partitions each parameter is split into
+        - ``fp32_flat_groups``: one entry per rank, each a sequence of flat fp32 tensors (one per group)
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first entry is read
+
+    Raises:
+        - ``ValueError``: the number of consumed elements differs from the number available
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # every rank holds a sequence of flat groups, so report each group on its own; the former
+        # code called .shape/.numel() on the sequence itself and crashed as soon as debug was on
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # start offset of every group inside one rank's concatenated groups, plus the total as last entry
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    # offset counted the elements of one rank, scale it up to all ranks for the comparison
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a ZeRO stage 3 checkpoint: buffers, then frozen parameters
+    (unless ``exclude_frozen_parameters``), then trainable parameters, then shared parameter entries.
+    """
+    state_dict = OrderedDict()
+    rank0_state = zero_model_states[0]
+
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in rank0_state.shared_params:
+        target_name, source_name = pair[0], pair[1]
+        if source_name in state_dict:
+            state_dict[target_name] = state_dict[source_name]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+
+    Entries that refer to the same object (same ``id``) end up sharing one converted tensor.
+    With ``return_empty_tensor`` an uninitialized tensor of matching shape and dtype is created
+    instead of calling ``.contiguous()``.
+    """
+    torch_state_dict = {}
+    first_name_by_id = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in first_name_by_id:  # shared tensors
+            torch_state_dict[name] = torch_state_dict[first_name_by_id[tensor_id]]
+            continue
+
+        first_name_by_id[tensor_id] = name
+        if return_empty_tensor:
+            converted = torch.empty(tensor.shape, dtype=tensor.dtype)
+        else:
+            converted = tensor.contiguous()
+        torch_state_dict[name] = converted
+    return torch_state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: ``tag`` is None and ``checkpoint_dir`` has no ``latest`` file
+        - ``FileNotFoundError``: the ``checkpoint_dir``/``tag`` folder doesn't exist
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        # fall back to the tag stored in the 'latest' file next to the checkpoint folders
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        # hand back the reconstructed dict as is, without converting its entries
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Consolidate a ZeRO checkpoint into fp32 weight file(s) written under ``output_dir``.
+
+    The state_dict is requested in lazy mode, optionally partitioned into shards, and then each
+    shard is converted to torch tensors, saved, and released before the next shard is handled.
+    If the resulting split reports ``is_sharded``, an index JSON file (metadata plus a tensor-name
+    to file-name map) is written next to the shards.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory that receives the fp32 state_dict output files; created if missing
+        - ``max_shard_size``: maximum size of one output file before the weights are sharded, default is 5GB. ``None`` disables sharding and writes one file
+        - ``safe_serialization``: save with `safetensors` when true, otherwise with ``torch.save`` (which uses `pickle`)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: re-raised (after printing an install hint) when ``safetensors`` or ``huggingface_hub`` is needed but not installed
+    """
+    use_sharding = max_shard_size is not None
+
+    # Check the optional dependencies up front, before any checkpoint data is read.
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if use_sharding:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Lazy mode: the per-shard conversion to torch tensors happens in the save loop below.
+    lazy_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                               tag,
+                                                               exclude_frozen_parameters,
+                                                               lazy_mode=True)
+
+    # The weight file name and the index file name depend only on the serialization format.
+    if safe_serialization:
+        weights_name = "model.safetensors"
+        index_name = "model.safetensors.index.json"
+    else:
+        weights_name = "pytorch_model.bin"
+        index_name = "pytorch_model.bin.index.json"
+
+    if use_sharding:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin")
+        filename_pattern = filename_pattern.replace(".safetensors", "{suffix}.safetensors")
+        # The shard layout is planned on the ``return_empty_tensor=True`` variant of the
+        # state_dict (presumably data-free placeholders), not on the real weights.
+        placeholders = to_torch_tensor(lazy_state_dict, return_empty_tensor=True)
+        split_plan = split_torch_state_dict_into_shards(placeholders,
+                                                        filename_pattern=filename_pattern,
+                                                        max_shard_size=max_shard_size)
+    else:
+        # No sharding requested: mimic the split result with a single file holding every tensor.
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        split_plan = StateDictSplit(is_sharded=False,
+                                    filename_to_tensors={weights_name: list(lazy_state_dict.keys())})
+
+    # Write the shards one after another.
+    os.makedirs(output_dir, exist_ok=True)
+    shard_plan = split_plan.filename_to_tensors.items()
+    for shard_file, tensor_names in tqdm(shard_plan, desc="Saving checkpoint shards"):
+        shard = to_torch_tensor({name: lazy_state_dict[name] for name in tensor_names})
+        shard_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, shard_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, shard_path)
+        # Drop every reference to the tensors just written so their memory can be reclaimed.
+        for name in list(shard.keys()):
+            del lazy_state_dict[name]
+            del shard[name]
+        del shard
+        gc.collect()
+
+    # The index file is only needed when the weights were actually sharded.
+    if not split_plan.is_sharded:
+        return
+
+    index = {
+        "metadata": split_plan.metadata,
+        "weight_map": split_plan.tensor_to_filename,
+    }
+    index_path = os.path.join(output_dir, index_name)
+    with open(index_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    Load the consolidated fp32 weights of a ZeRO checkpoint into ``model``.
+
+    Steps performed:
+
+    1. Extract the fp32 ``state_dict`` via ``get_fp32_state_dict_from_zero_checkpoint``
+    2. Move the provided model to cpu
+    3. Load the ``state_dict`` into the model with ``strict=False``
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: the object returned by ``model.cpu()``, with the fp32 weights loaded
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    # strict=False: the load is requested in non-strict mode, as in the original call.
+    cpu_model.load_state_dict(fp32_weights, strict=False)
+    return cpu_model
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the arguments and run the checkpoint conversion.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    # NOTE: adjacent string literals are concatenated verbatim, so each continuation piece
+    # below carries its own separating space.
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    # Rebinds the module-level ``debug`` flag from the command line.
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/dapo_milora_plus_20251201_131939/output.log b/dapo_milora_plus_20251201_131939/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..2697349482dbf063b4861232778ab4639af76e6f
--- /dev/null
+++ b/dapo_milora_plus_20251201_131939/output.log
@@ -0,0 +1,8813 @@
+W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] 
+W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] *****************************************
+W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] *****************************************
+INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda.
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, 
max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+
+[OpenTinker] 2025-12-01 13:20:26,167 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it
+[OpenTinker] 2025-12-01 13:20:26,167 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it
+[OpenTinker] 2025-12-01 13:20:26,167 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-01 13:20:26,169 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: setting up run ruhht7fc
+wandb: setting up run f7ojo7cc
+wandb: setting up run 56v55mci
+wandb: setting up run 79eq2874
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-79eq2874
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_milora_plus_20251201_131939
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/79eq2874
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-ruhht7fc
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_milora_plus_20251201_131939
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/ruhht7fc
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-f7ojo7cc
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_milora_plus_20251201_131939
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/f7ojo7cc
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-56v55mci
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_milora_plus_20251201_131939
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/56v55mci
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-01 13:20:31,962 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-01 13:20:31,962 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-01 13:20:32,107 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-01 13:20:32,107 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-01 13:20:33,070 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-01 13:20:33,133 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-01 13:20:33,152 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-01 13:20:33,507 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-01 13:20:36,261 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-01 13:20:36,268 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-12-01 13:20:36,448 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+`torch_dtype` is deprecated! Use `dtype` instead!
+[OpenTinker] 2025-12-01 13:20:36,623 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+[OpenTinker] 2025-12-01 13:20:37,520 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-01 13:20:37,520 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-01 13:20:37,635 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-01 13:20:37,635 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-01 13:20:37,706 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization...
+[OpenTinker] 2025-12-01 13:20:37,706 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace
+[OpenTinker] 2025-12-01 13:20:37,749 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-01 13:20:37,749 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-01 13:20:37,791 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization...
+[OpenTinker] 2025-12-01 13:20:37,791 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace
+[OpenTinker] 2025-12-01 13:20:37,897 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-01 13:20:37,897 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-01 13:20:37,902 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization...
+[OpenTinker] 2025-12-01 13:20:37,903 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace
+[OpenTinker] 2025-12-01 13:20:38,050 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization...
+[OpenTinker] 2025-12-01 13:20:38,051 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace
+[OpenTinker] 2025-12-01 13:20:38,417 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:38,448 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:38,478 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:38,513 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:38,545 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:38,575 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:38,630 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:38,662 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:38,692 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:38,763 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:38,795 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:38,825 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:40,266 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:40,387 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:40,512 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:40,621 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:45,341 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:45,556 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:45,736 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:45,798 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:46,049 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:46,080 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:46,111 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:46,273 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:46,303 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:46,334 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:46,446 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:46,477 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:46,507 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:46,524 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:46,554 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:46,586 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:47,887 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:48,130 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:48,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:48,395 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:52,959 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:53,290 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:53,388 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:53,646 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj
+[OpenTinker] 2025-12-01 13:20:53,669 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:53,700 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:53,732 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:54,009 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:54,040 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:54,070 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:54,099 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:54,130 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:54,161 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:54,376 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:20:54,407 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:20:54,438 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:20:55,507 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:55,866 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:55,936 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj
+[OpenTinker] 2025-12-01 13:20:56,245 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:00,574 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:01,031 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:01,034 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:01,289 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:01,320 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:01,349 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:01,516 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:01,745 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:01,751 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:01,776 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:01,782 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:01,806 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:01,814 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:02,245 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:02,276 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:02,306 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:03,142 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:03,596 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:03,625 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:04,123 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:08,214 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:08,687 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:08,795 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:08,926 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:08,956 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:08,986 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:09,366 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:09,395 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:09,426 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:09,456 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:09,516 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:09,547 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:09,576 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:10,096 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:10,126 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:10,156 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:10,766 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:11,232 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:11,378 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:11,960 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:15,801 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:16,318 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:16,505 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:16,522 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:16,535 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:16,565 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:17,026 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:17,057 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:17,087 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:17,216 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:17,240 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:17,271 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:17,300 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:17,945 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:17,975 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:18,006 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:18,341 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:18,864 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:19,094 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:19,814 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:23,392 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:23,957 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:24,102 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:24,133 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:24,163 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:24,247 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:24,659 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:24,690 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:24,720 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:24,967 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:24,998 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:25,028 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:25,049 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:25,769 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:25,800 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:25,829 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:25,943 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:26,500 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:26,819 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:27,635 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:31,007 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:31,578 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:31,711 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:31,741 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:31,771 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:31,965 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:32,283 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:32,313 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:32,342 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:32,683 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:32,714 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:32,744 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:32,888 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:33,555 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:33,612 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:33,643 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:33,673 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:34,130 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:34,532 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:35,466 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:38,606 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:39,186 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:39,319 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:39,350 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:39,380 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:39,674 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:39,894 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:39,925 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:39,954 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:40,397 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:40,428 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:40,457 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:40,583 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:41,162 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:41,299 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:41,329 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:41,359 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:41,736 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:42,254 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:43,155 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:46,217 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:46,796 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:46,922 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:46,954 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:46,984 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:47,441 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:47,504 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:47,534 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:47,564 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:48,164 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:48,195 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:48,225 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:48,272 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:48,766 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:48,987 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:49,017 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:49,047 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:49,351 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:50,024 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:50,845 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:53,809 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:54,398 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:54,522 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:54,552 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:54,583 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:55,106 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:55,139 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:55,170 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:55,199 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:55,915 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:55,943 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj
+[OpenTinker] 2025-12-01 13:21:55,945 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:55,975 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:56,358 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:56,651 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:21:56,681 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:21:56,712 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:21:56,944 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:57,769 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.up_proj
+[OpenTinker] 2025-12-01 13:21:58,506 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:01,409 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:01,987 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:02,115 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:02,146 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:02,178 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:02,689 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:02,719 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:02,750 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:02,920 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:03,589 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:03,635 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:03,665 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:03,695 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:03,954 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:04,294 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:04,324 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:04,354 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:04,537 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:05,484 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:06,135 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:08,981 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:09,589 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:09,678 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:09,708 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:09,738 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:10,286 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:10,316 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:10,346 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:10,647 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:11,246 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:11,361 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:11,391 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:11,421 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:11,520 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:11,949 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:11,978 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:12,008 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:12,134 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:13,210 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:13,802 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:16,572 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:17,177 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:17,285 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:17,314 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:17,344 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:17,881 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:17,911 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:17,940 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:18,370 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:18,904 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:19,092 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:19,122 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:19,128 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:19,152 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:19,614 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:19,644 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:19,674 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:19,721 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:20,944 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:21,464 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:24,184 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:24,777 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:24,886 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:24,915 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:24,945 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:25,476 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:25,505 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:25,535 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:26,097 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:26,554 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:26,725 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:26,812 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:26,842 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:26,872 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:27,257 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:27,287 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:27,314 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:27,317 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:28,663 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:29,118 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:31,794 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:32,354 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:32,495 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:32,525 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:32,555 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:33,054 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:33,084 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:33,113 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:33,818 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:34,221 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:34,354 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:34,533 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:34,563 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:34,593 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:34,902 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:34,926 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:34,956 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:34,986 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:36,390 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:36,779 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:39,409 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:39,942 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:40,112 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:40,142 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:40,171 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:40,643 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:40,672 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:40,702 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:41,557 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:41,885 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:41,958 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:42,275 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:42,306 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:42,335 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:42,484 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:42,593 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:42,623 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:42,654 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:44,135 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:44,451 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:47,021 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:47,550 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:47,726 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:47,756 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:47,788 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:48,259 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:48,289 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:48,319 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:49,294 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:49,572 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:49,596 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:50,013 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:50,044 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:50,073 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:50,099 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:50,308 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:50,339 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:50,369 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:51,866 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:52,170 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:54,634 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:55,161 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:55,338 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:55,368 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:55,398 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:55,866 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:55,896 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:55,925 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:57,024 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:57,194 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:57,297 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj
+[OpenTinker] 2025-12-01 13:22:57,702 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:57,740 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:57,770 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:57,800 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:58,009 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:22:58,039 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:22:58,069 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:22:59,587 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.up_proj
+[OpenTinker] 2025-12-01 13:22:59,862 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:02,256 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:02,756 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:02,960 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:02,990 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:03,020 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:03,459 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:03,489 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:03,519 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:04,745 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:04,804 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:05,000 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:05,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:05,469 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:05,499 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:05,530 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:05,711 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:05,741 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:05,771 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:07,326 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:07,565 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:09,855 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:10,351 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:10,554 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:10,585 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:10,614 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:11,054 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:11,084 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:11,114 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:12,406 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:12,487 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:12,683 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:12,895 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:13,203 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:13,232 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:13,263 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:13,390 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:13,420 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:13,449 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:15,059 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:15,249 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:17,476 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:17,963 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:18,177 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:18,207 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:18,237 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:18,668 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:18,698 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:18,728 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:20,022 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:20,242 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:20,414 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:20,509 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:20,958 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:20,988 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:21,017 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:21,122 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:21,152 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:21,181 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:22,810 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:22,968 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:25,088 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:25,567 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:25,797 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:25,827 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:25,857 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:26,271 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:26,301 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:26,331 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:27,644 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:27,980 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:28,107 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:28,114 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:28,699 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:28,729 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:28,759 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:28,817 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:28,847 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:28,876 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:30,560 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:30,688 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:32,718 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:33,176 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:33,421 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:33,451 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:33,481 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:33,882 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:33,912 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:33,941 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:35,262 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:35,726 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:35,732 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:35,819 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:36,445 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:36,476 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:36,505 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:36,532 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:36,561 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:36,591 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:38,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:38,390 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:40,348 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:40,808 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:41,057 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:41,087 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:41,117 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:41,522 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:41,552 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:41,582 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:42,893 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:43,375 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:43,483 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:43,536 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:44,203 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:44,234 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:44,252 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:44,263 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:44,283 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:44,312 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:46,055 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:46,114 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:47,952 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:48,435 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:48,657 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:48,687 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:48,716 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:49,139 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:49,169 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:49,199 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:50,499 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:50,994 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:51,236 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:51,267 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:51,953 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:51,979 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:51,983 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:52,009 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:52,013 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:52,039 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:53,818 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:53,834 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:55,560 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:56,058 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:56,266 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:56,296 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:56,328 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:56,761 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:56,791 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:56,821 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:58,108 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:58,607 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj
+[OpenTinker] 2025-12-01 13:23:58,983 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:59,001 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj
+[OpenTinker] 2025-12-01 13:23:59,693 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:59,713 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:23:59,723 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:59,744 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:23:59,753 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:23:59,773 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:24:01,551 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj
+[OpenTinker] 2025-12-01 13:24:01,577 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj
+[OpenTinker] 2025-12-01 13:24:03,165 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:03,681 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:03,865 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:24:03,894 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:24:03,924 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:24:04,381 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:24:04,411 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:24:04,441 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:24:05,693 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.up_proj
+[OpenTinker] 2025-12-01 13:24:06,229 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.up_proj
+[OpenTinker] 2025-12-01 13:24:06,672 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:06,749 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:07,380 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:24:07,410 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:24:07,439 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:24:07,460 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj
+[OpenTinker] 2025-12-01 13:24:07,490 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj
+[OpenTinker] 2025-12-01 13:24:07,520 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj
+[OpenTinker] 2025-12-01 13:24:09,226 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.up_proj
+[OpenTinker] 2025-12-01 13:24:09,313 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.up_proj
+[OpenTinker] 2025-12-01 13:24:10,752 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:10,753 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 213.05 seconds
+[OpenTinker] 2025-12-01 13:24:10,754 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-01 13:24:11,252 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpxmtxyw6d/test.c -o /tmp/tmpxmtxyw6d/test.o
+[OpenTinker] 2025-12-01 13:24:11,280 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpxmtxyw6d/test.o -laio -o /tmp/tmpxmtxyw6d/a.out
+[OpenTinker] 2025-12-01 13:24:11,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:11,299 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 213.25 seconds
+[OpenTinker] 2025-12-01 13:24:11,300 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-01 13:24:11,606 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpdzsydi46/test.c -o /tmp/tmpdzsydi46/test.o
+[OpenTinker] 2025-12-01 13:24:11,632 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpdzsydi46/test.o -laio -o /tmp/tmpdzsydi46/a.out
+[OpenTinker] 2025-12-01 13:24:11,778 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp14hhwhxl/test.c -o /tmp/tmp14hhwhxl/test.o
+[OpenTinker] 2025-12-01 13:24:11,806 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp14hhwhxl/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp14hhwhxl/a.out
+[OpenTinker] 2025-12-01 13:24:12,087 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpxl1e93d0/test.c -o /tmp/tmpxl1e93d0/test.o
+[OpenTinker] 2025-12-01 13:24:12,121 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpxl1e93d0/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpxl1e93d0/a.out
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+[OpenTinker] 2025-12-01 13:24:14,348 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:14,348 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 216.45 seconds
+[OpenTinker] 2025-12-01 13:24:14,349 - root - INFO - Lora configured successfully
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Using network Socket
+[OpenTinker] 2025-12-01 13:24:14,492 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj
+[OpenTinker] 2025-12-01 13:24:14,493 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 216.70 seconds
+[OpenTinker] 2025-12-01 13:24:14,493 - root - INFO - Lora configured successfully
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO ncclCommInitRankConfig comm 0x16a41420 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0xb34e3953a9afc2f1 - Init START
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO ncclCommInitRankConfig comm 0x1882be50 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0xb34e3953a9afc2f1 - Init START
+[OpenTinker] 2025-12-01 13:24:14,798 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpkw1j4eyo/test.c -o /tmp/tmpkw1j4eyo/test.o
+[OpenTinker] 2025-12-01 13:24:14,826 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpkw1j4eyo/test.o -laio -o /tmp/tmpkw1j4eyo/a.out
+[OpenTinker] 2025-12-01 13:24:14,852 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpu4eu2mm2/test.c -o /tmp/tmpu4eu2mm2/test.o
+[OpenTinker] 2025-12-01 13:24:14,876 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpu4eu2mm2/test.o -laio -o /tmp/tmpu4eu2mm2/a.out
+[OpenTinker] 2025-12-01 13:24:15,335 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpp2jhwke4/test.c -o /tmp/tmpp2jhwke4/test.o
+[OpenTinker] 2025-12-01 13:24:15,347 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp8e96nr6y/test.c -o /tmp/tmp8e96nr6y/test.o
+[OpenTinker] 2025-12-01 13:24:15,363 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpp2jhwke4/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpp2jhwke4/a.out
+[OpenTinker] 2025-12-01 13:24:15,381 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp8e96nr6y/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp8e96nr6y/a.out
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO cudaDriverVersion 12090
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0>
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Initialized NET plugin Socket
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO ncclCommInitRankConfig comm 0x13666ca0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0xb34e3953a9afc2f1 - Init START
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO ncclCommInitRankConfig comm 0x18bf06d0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0xb34e3953a9afc2f1 - Init START
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Bootstrap timings total 0.000750 (create 0.000021, send 0.000098, recv 0.000224, ring 0.000132, delay 0.000000)
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Bootstrap timings total 0.000796 (create 0.000021, send 0.000113, recv 0.000148, ring 0.000174, delay 0.000001)
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Bootstrap timings total 3.639589 (create 0.000020, send 0.000102, recv 3.638934, ring 0.000101, delay 0.000001)
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Bootstrap timings total 3.641966 (create 0.000021, send 0.000091, recv 3.641426, ring 0.000041, delay 0.000001)
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO comm 0x1882be50 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO comm 0x13666ca0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO comm 0x18bf06d0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO comm 0x16a41420 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1172059:1173271 [2] NCCL INFO [Proxy Service] Device 2 CPU core 16
+lshn-qs-pjul-8:1172059:1173272 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 139
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1172058:1173273 [1] NCCL INFO [Proxy Service] Device 1 CPU core 107
+lshn-qs-pjul-8:1172058:1173274 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 17
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1172060:1173275 [3] NCCL INFO [Proxy Service] Device 3 CPU core 137
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-pjul-8:1172060:1173276 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 0
+lshn-qs-pjul-8:1172057:1173277 [0] NCCL INFO [Proxy Service] Device 0 CPU core 1
+lshn-qs-pjul-8:1172057:1173278 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 6
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO ncclCommInitRankConfig comm 0x13666ca0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0xb34e3953a9afc2f1 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.58 (kernels 0.21, alloc 0.28, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.02)
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO ncclCommInitRankConfig comm 0x16a41420 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0xb34e3953a9afc2f1 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 4.24 (kernels 0.14, alloc 0.37, bootstrap 3.64, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.02)
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO ncclCommInitRankConfig comm 0x18bf06d0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0xb34e3953a9afc2f1 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.58 (kernels 0.21, alloc 0.28, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.02)
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO ncclCommInitRankConfig comm 0x1882be50 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0xb34e3953a9afc2f1 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 4.22 (kernels 0.14, alloc 0.35, bootstrap 3.64, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.03)
+[OpenTinker] 2025-12-01 13:24:18,716 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-01 13:24:18,736 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-01 13:24:18,736 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-01 13:24:18,737 - root - INFO - Training model with GRPO
+INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896
+INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896
+INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896
+INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896
+INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-01 13:24:37 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-01 13:24:38 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-01 13:24:38 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 12-01 13:24:38 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+[rank3]:[W1201 13:24:39.729535628 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+[rank2]:[W1201 13:24:39.840665733 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+[rank0]:[W1201 13:24:39.847855218 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+[rank1]:[W1201 13:24:39.880720742 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO ncclCommSplit comm 0x19c7aa30 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 1 color 2003953581 key 1- Init START
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO ncclCommSplit comm 0x155745e0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 1 color 2003953581 key 3- Init START
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO ncclCommSplit comm 0x1aca76b0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 1 color 2003953581 key 0- Init START
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO ncclCommSplit comm 0x1ba5b2b0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 1 color 2003953581 key 2- Init START
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO comm 0x1ba5b2b0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO comm 0x19c7aa30 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO comm 0x1aca76b0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO comm 0x155745e0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173418 [3] NCCL INFO [Proxy Service] Device 3 CPU core 10
+lshn-qs-pjul-8:1172060:1173419 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 11
+lshn-qs-pjul-8:1172059:1173420 [2] NCCL INFO [Proxy Service] Device 2 CPU core 114
+lshn-qs-pjul-8:1172059:1173421 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 19
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-pjul-8:1172057:1173422 [0] NCCL INFO [Proxy Service] Device 0 CPU core 116
+lshn-qs-pjul-8:1172057:1173423 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 21
+lshn-qs-pjul-8:1172058:1173424 [1] NCCL INFO [Proxy Service] Device 1 CPU core 26
+lshn-qs-pjul-8:1172058:1173425 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 28
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO ncclCommSplit comm 0x155745e0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 1 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO ncclCommSplit comm 0x1aca76b0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 1 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO ncclCommSplit comm 0x19c7aa30 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 1 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO ncclCommSplit comm 0x1ba5b2b0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 1 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.21 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.16)
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.04)
+lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.05)
+[Gloo] Rank 0 is connected to 3[Gloo] Rank 1 peer ranks. Expected number of connected peer ranks is : [Gloo] Rank 2[Gloo] Rank 3 is connected to 33
+ is connected to 3 is connected to 3 peer ranks. Expected number of connected peer ranks is :  peer ranks. Expected number of connected peer ranks is :  peer ranks. Expected number of connected peer ranks is : 3
+3
+3
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO ncclCommSplit comm 0x1adbb6b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 2 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO comm 0x1adbb6b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172057:1173448 [0] NCCL INFO [Proxy Service] Device 0 CPU core 7
+lshn-qs-pjul-8:1172057:1173449 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 34
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO ncclCommSplit comm 0x1adbb6b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 2 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO ncclCommSplit comm 0x19d8f5e0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 4 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO comm 0x19d8f5e0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172058:1173463 [1] NCCL INFO [Proxy Service] Device 1 CPU core 12
+lshn-qs-pjul-8:1172058:1173464 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 0
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO ncclCommSplit comm 0x19d8f5e0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 4 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO ncclCommSplit comm 0x1bb6f140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 6 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO comm 0x1bb6f140 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172059:1173478 [2] NCCL INFO [Proxy Service] Device 2 CPU core 109
+lshn-qs-pjul-8:1172059:1173479 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 99
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO ncclCommSplit comm 0x1bb6f140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 6 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO ncclCommSplit comm 0x15688960 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 8 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO comm 0x15688960 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172060:1173495 [3] NCCL INFO [Proxy Service] Device 3 CPU core 116
+lshn-qs-pjul-8:1172060:1173496 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 19
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO ncclCommSplit comm 0x15688960 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 8 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO ncclCommSplit comm 0x1c505190 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 9 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO comm 0x1c505190 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173501 [0] NCCL INFO [Proxy Service] Device 0 CPU core 103
+lshn-qs-pjul-8:1172057:1173505 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 131
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO ncclCommSplit comm 0x1c505190 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 9 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO ncclCommSplit comm 0x1b4ebe30 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 11 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO comm 0x1b4ebe30 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172058:1173519 [1] NCCL INFO [Proxy Service] Device 1 CPU core 123
+lshn-qs-pjul-8:1172058:1173520 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 130
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO ncclCommSplit comm 0x1b4ebe30 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 11 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.05, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO ncclCommSplit comm 0x1bc771b0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 13 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO comm 0x1bc771b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172059:1173534 [2] NCCL INFO [Proxy Service] Device 2 CPU core 137
+lshn-qs-pjul-8:1172059:1173535 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 14
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO ncclCommSplit comm 0x1bc771b0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 13 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO ncclCommSplit comm 0x157909d0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 15 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO comm 0x157909d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172060:1173551 [3] NCCL INFO [Proxy Service] Device 3 CPU core 99
+lshn-qs-pjul-8:1172060:1173552 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 109
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO ncclCommSplit comm 0x157909d0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 15 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO ncclCommSplit comm 0x1c60cda0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 16 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO comm 0x1c60cda0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172057:1173560 [0] NCCL INFO [Proxy Service] Device 0 CPU core 105
+lshn-qs-pjul-8:1172057:1173561 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 143
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO ncclCommSplit comm 0x1c60cda0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 16 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO ncclCommSplit comm 0x1b5f3a40 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 18 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO comm 0x1b5f3a40 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172058:1173575 [1] NCCL INFO [Proxy Service] Device 1 CPU core 42
+lshn-qs-pjul-8:1172058:1173576 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 22
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO ncclCommSplit comm 0x1b5f3a40 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 18 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO ncclCommSplit comm 0x19012070 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 20 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO comm 0x19012070 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172059:1173590 [2] NCCL INFO [Proxy Service] Device 2 CPU core 132
+lshn-qs-pjul-8:1172059:1173591 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 116
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO ncclCommSplit comm 0x19012070 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 20 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO ncclCommSplit comm 0x1b6d3200 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 22 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO comm 0x1b6d3200 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172060:1173607 [3] NCCL INFO [Proxy Service] Device 3 CPU core 120
+lshn-qs-pjul-8:1172060:1173608 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 39
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO ncclCommSplit comm 0x1b6d3200 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 22 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO ncclCommSplit comm 0x1c7149b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 23 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO comm 0x1c7149b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172057:1173616 [0] NCCL INFO [Proxy Service] Device 0 CPU core 25
+lshn-qs-pjul-8:1172057:1173617 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 12
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO ncclCommSplit comm 0x1c7149b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 23 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.03)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO ncclCommSplit comm 0x1b6fb650 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 25 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO comm 0x1b6fb650 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172058:1173631 [1] NCCL INFO [Proxy Service] Device 1 CPU core 44
+lshn-qs-pjul-8:1172058:1173632 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 0
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO ncclCommSplit comm 0x1b6fb650 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 25 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO ncclCommSplit comm 0x19119c80 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 27 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO comm 0x19119c80 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172059:1173646 [2] NCCL INFO [Proxy Service] Device 2 CPU core 16
+lshn-qs-pjul-8:1172059:1173647 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 19
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO ncclCommSplit comm 0x19119c80 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 27 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO ncclCommSplit comm 0x1b7dae10 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 29 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO comm 0x1b7dae10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172060:1173663 [3] NCCL INFO [Proxy Service] Device 3 CPU core 42
+lshn-qs-pjul-8:1172060:1173664 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 7
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO ncclCommSplit comm 0x1b7dae10 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 29 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO ncclCommSplit comm 0x1c81c5c0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 30 color 59908776 key 0- Init START
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO comm 0x1c81c5c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172057:1173672 [0] NCCL INFO [Proxy Service] Device 0 CPU core 136
+lshn-qs-pjul-8:1172057:1173673 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 105
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO ncclCommSplit comm 0x1c81c5c0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 30 color 59908776 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO ncclCommSplit comm 0x1b803260 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 32 color 440515407 key 0- Init START
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO comm 0x1b803260 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172058:1173687 [1] NCCL INFO [Proxy Service] Device 1 CPU core 116
+lshn-qs-pjul-8:1172058:1173688 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 124
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO ncclCommSplit comm 0x1b803260 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 32 color 440515407 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO ncclCommSplit comm 0x19221890 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 34 color 1227022723 key 0- Init START
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO comm 0x19221890 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172059:1173702 [2] NCCL INFO [Proxy Service] Device 2 CPU core 3
+lshn-qs-pjul-8:1172059:1173703 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 1
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO ncclCommSplit comm 0x19221890 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 34 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Using network Socket
+INFO 12-01 13:24:40 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-01 13:24:40 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-01 13:24:40 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO ncclCommSplit comm 0x1b8e2a20 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 36 color 1301067556 key 0- Init START
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO comm 0x1b8e2a20 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-pjul-8:1172060:1173714 [3] NCCL INFO [Proxy Service] Device 3 CPU core 23
+lshn-qs-pjul-8:1172060:1173715 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 126
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO ncclCommSplit comm 0x1b8e2a20 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 36 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+INFO 12-01 13:24:40 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-01 13:24:41 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 12-01 13:24:42 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
+[AINFO 12-01 13:24:42 [default_loader.py:268] Loading weights took 0.62 seconds
+INFO 12-01 13:24:43 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.82it/s]
+[ALoading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.82it/s]
+
+INFO 12-01 13:24:43 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 1.914775 seconds
+INFO 12-01 13:24:43 [default_loader.py:268] Loading weights took 0.62 seconds
+INFO 12-01 13:24:43 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 12-01 13:24:43 [default_loader.py:268] Loading weights took 0.61 seconds
+INFO 12-01 13:24:43 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 2.392506 seconds
+INFO 12-01 13:24:44 [default_loader.py:268] Loading weights took 0.61 seconds
+INFO 12-01 13:24:44 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 2.941163 seconds
+INFO 12-01 13:24:44 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 3.506916 seconds
+INFO 12-01 13:24:48 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_1_0/backbone for vLLM's torch.compile
+INFO 12-01 13:24:48 [backends.py:550] Dynamo bytecode transform time: 5.55 s
+INFO 12-01 13:24:49 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_0_0/backbone for vLLM's torch.compile
+INFO 12-01 13:24:49 [backends.py:550] Dynamo bytecode transform time: 5.49 s
+INFO 12-01 13:24:49 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_2_0/backbone for vLLM's torch.compile
+INFO 12-01 13:24:49 [backends.py:550] Dynamo bytecode transform time: 5.42 s
+INFO 12-01 13:24:50 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_3_0/backbone for vLLM's torch.compile
+INFO 12-01 13:24:50 [backends.py:550] Dynamo bytecode transform time: 5.32 s
+INFO 12-01 13:24:51 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.665 s
+INFO 12-01 13:24:52 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.543 s
+INFO 12-01 13:24:52 [monitor.py:34] torch.compile takes 5.55 s in total
+INFO 12-01 13:24:52 [monitor.py:34] torch.compile takes 5.49 s in total
+INFO 12-01 13:24:52 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.573 s
+INFO 12-01 13:24:52 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.540 s
+INFO 12-01 13:24:53 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-01 13:24:53 [monitor.py:34] torch.compile takes 5.42 s in total
+INFO 12-01 13:24:53 [monitor.py:34] torch.compile takes 5.32 s in total
+INFO 12-01 13:24:53 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-01 13:24:54 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-01 13:24:54 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 12-01 13:24:54 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-01 13:24:54 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-01 13:24:54 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-01 13:24:54 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 12-01 13:24:54 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-01 13:24:54 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-01 13:24:54 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 12-01 13:24:54 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/11 [00:00<?, ?it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  36%|███▋      | 4/11 [00:00<00:00, 34.12it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  73%|███████▎  | 8/11 [00:00<00:00, 36.69it/s][ACapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 11/11 [00:00<00:00, 38.15it/s]
+INFO 12-01 13:24:55 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-01 13:24:55 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-01 13:24:55 [core.py:218] init engine (profile, create kv cache, warmup model) took 10.70 seconds
+INFO 12-01 13:24:55 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-01 13:24:55 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-01 13:24:55 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-01 13:24:55 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-01 13:24:55 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 12-01 13:24:55 [core.py:218] init engine (profile, create kv cache, warmup model) took 11.27 seconds
+INFO 12-01 13:24:55 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 12-01 13:24:55 [core.py:218] init engine (profile, create kv cache, warmup model) took 11.80 seconds
+INFO 12-01 13:24:55 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.47 seconds
+INFO 12-01 13:24:56 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-01 13:24:56 [__init__.py:36] No IOProcessor plugins requested by the model
+INFO 12-01 13:24:56 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-01 13:24:56 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM
+INFO 12-01 13:24:56 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-01 13:24:56 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
+INFO 12-01 13:24:56 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-01 13:24:56 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+[OpenTinker] 2025-12-01 13:24:57,487 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value.
+lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Using network Socket
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO ncclCommSplit comm 0x4a601550 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 37 color 2003953581 key 1- Init START
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO ncclCommSplit comm 0x4a6d8c30 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 37 color 2003953581 key 3- Init START
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO ncclCommSplit comm 0x4a267b90 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 37 color 2003953581 key 2- Init START
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO ncclCommSplit comm 0x4b5afb40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 37 color 2003953581 key 0- Init START
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO comm 0x4a267b90 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO comm 0x4b5afb40 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO comm 0x4a601550 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO comm 0x4a6d8c30 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-pjul-8:1172058:1173849 [1] NCCL INFO [Proxy Service] Device 1 CPU core 122
+lshn-qs-pjul-8:1172059:1173850 [2] NCCL INFO [Proxy Service] Device 2 CPU core 14
+lshn-qs-pjul-8:1172058:1173851 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 127
+lshn-qs-pjul-8:1172059:1173852 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 15
+lshn-qs-pjul-8:1172060:1173853 [3] NCCL INFO [Proxy Service] Device 3 CPU core 43
+lshn-qs-pjul-8:1172060:1173854 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 44
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-pjul-8:1172057:1173855 [0] NCCL INFO [Proxy Service] Device 0 CPU core 130
+lshn-qs-pjul-8:1172057:1173856 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 133
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO ncclCommSplit comm 0x4a601550 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 37 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO ncclCommSplit comm 0x4a267b90 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 37 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO ncclCommSplit comm 0x4b5afb40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 37 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO ncclCommSplit comm 0x4a6d8c30 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 37 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+INFO 12-01 13:24:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:24:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:24:58 [block_pool.py:292] Successfully reset prefix cache
+wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
+
+  0%|          | 0/1024 [00:00<?, ?it/s][AINFO 12-01 13:24:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:25:01 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-01 13:25:01 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-01 13:25:02 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-01 13:25:02 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 1/1024 [02:22<40:25:53, 142.28s/it][A
+                                                    [A{'loss': 0.0591, 'grad_norm': 0.004114801995456219, 'learning_rate': 1e-05, 'num_tokens': 792270.0, 'completions/mean_length': 6039.171875, 'completions/min_length': 250.0, 'completions/max_length': 15689.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6039.171875, 'completions/min_terminated_length': 250.0, 'completions/max_terminated_length': 15689.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021642697975039482, 'sampling/sampling_logp_difference/max': 8.311287879943848, 'sampling/importance_sampling_ratio/min': 0.0002457273658365011, 'sampling/importance_sampling_ratio/mean': 0.9999940395355225, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.19118632376194, 'clip_ratio/low_mean': 0.0, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 0.0, 'epoch': 0.0}
+
+  0%|          | 1/1024 [02:22<40:25:53, 142.28s/it][AINFO 12-01 13:27:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:27:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:27:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:27:21 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 2/1024 [04:30<38:06:00, 134.21s/it][A
+                                                    [A{'loss': 0.0407, 'grad_norm': 0.004017667844891548, 'learning_rate': 1e-05, 'num_tokens': 1452816.0, 'completions/mean_length': 4978.265625, 'completions/min_length': 395.0, 'completions/max_length': 15112.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4978.265625, 'completions/min_terminated_length': 395.0, 'completions/max_terminated_length': 15112.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.30798622965812683, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01879144087433815, 'sampling/sampling_logp_difference/max': 4.778462886810303, 'sampling/importance_sampling_ratio/min': 0.00840891432017088, 'sampling/importance_sampling_ratio/mean': 0.9999986290931702, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9862165078520775, 'clip_ratio/low_mean': 5.146006606082665e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.031489318847889e-06, 'clip_ratio/high_max': 4.125957275391556e-06, 'clip_ratio/region_mean': 5.249155537967454e-05, 'epoch': 0.0}
+
+  0%|          | 2/1024 [04:30<38:06:00, 134.21s/it][AINFO 12-01 13:29:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:29:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:29:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:29:30 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 3/1024 [07:10<41:18:14, 145.64s/it][A
+                                                    [A{'loss': 0.0606, 'grad_norm': 0.002359058242291212, 'learning_rate': 1e-05, 'num_tokens': 2324415.0, 'completions/mean_length': 6664.3046875, 'completions/min_length': 477.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6587.771484375, 'completions/min_terminated_length': 477.0, 'completions/max_terminated_length': 16118.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.3090519607067108, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02111843228340149, 'sampling/sampling_logp_difference/max': 6.311188697814941, 'sampling/importance_sampling_ratio/min': 0.0018158734310418367, 'sampling/importance_sampling_ratio/mean': 1.0000684261322021, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9934953600168228, 'clip_ratio/low_mean': 5.7621912446848e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4842080418020487e-06, 'clip_ratio/high_max': 5.936832167208195e-06, 'clip_ratio/region_mean': 5.910612048865005e-05, 'epoch': 0.0}
+
+  0%|          | 3/1024 [07:10<41:18:14, 145.64s/it][AINFO 12-01 13:32:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:32:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:32:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:32:09 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 4/1024 [09:35<41:15:49, 145.64s/it][A
+                                                    [A{'loss': 0.0096, 'grad_norm': 0.002563449554145336, 'learning_rate': 1e-05, 'num_tokens': 3091369.0, 'completions/mean_length': 5801.203125, 'completions/min_length': 252.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5717.8740234375, 'completions/min_terminated_length': 252.0, 'completions/max_terminated_length': 15915.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.2580180764198303, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021363306790590286, 'sampling/sampling_logp_difference/max': 3.9732837677001953, 'sampling/importance_sampling_ratio/min': 0.018811559304594994, 'sampling/importance_sampling_ratio/mean': 1.000073790550232, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0870511680841446, 'clip_ratio/low_mean': 2.648322629283939e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5639363911977853e-06, 'clip_ratio/high_max': 1.0255745564791141e-05, 'clip_ratio/region_mean': 2.9047162797724013e-05, 'epoch': 0.0}
+
+  0%|          | 4/1024 [09:35<41:15:49, 145.64s/it][AINFO 12-01 13:34:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:34:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:34:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:34:35 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 5/1024 [12:11<42:17:35, 149.42s/it][A
+                                                    [A{'loss': -0.0344, 'grad_norm': 0.0025258746463805437, 'learning_rate': 1e-05, 'num_tokens': 3841078.0, 'completions/mean_length': 5696.4140625, 'completions/min_length': 539.0, 'completions/max_length': 15767.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5696.4140625, 'completions/min_terminated_length': 539.0, 'completions/max_terminated_length': 15767.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020693503320217133, 'sampling/sampling_logp_difference/max': 13.41861629486084, 'sampling/importance_sampling_ratio/min': 1.4871986877551535e-06, 'sampling/importance_sampling_ratio/mean': 0.999910295009613, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1476548686623573, 'clip_ratio/low_mean': 4.577123684157414e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4204003921113326e-06, 'clip_ratio/high_max': 9.68160156844533e-06, 'clip_ratio/region_mean': 4.8191637006311794e-05, 'epoch': 0.0}
+
+  0%|          | 5/1024 [12:11<42:17:35, 149.42s/it][AINFO 12-01 13:37:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:37:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:37:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:37:11 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 6/1024 [14:35<41:44:46, 147.63s/it][A
+                                                    [A{'loss': 0.0447, 'grad_norm': 0.0050104837864637375, 'learning_rate': 1e-05, 'num_tokens': 4535640.0, 'completions/mean_length': 5280.890625, 'completions/min_length': 296.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5104.65087890625, 'completions/min_terminated_length': 296.0, 'completions/max_terminated_length': 14489.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.35505855083465576, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018597707152366638, 'sampling/sampling_logp_difference/max': 7.237989902496338, 'sampling/importance_sampling_ratio/min': 0.0007187551236711442, 'sampling/importance_sampling_ratio/mean': 0.9998958110809326, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8976912423968315, 'clip_ratio/low_mean': 3.895585894042597e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.029715701861278e-05, 'clip_ratio/high_max': 3.660332322397153e-05, 'clip_ratio/region_mean': 4.925301630009926e-05, 'epoch': 0.01}
+
+  1%|          | 6/1024 [14:35<41:44:46, 147.63s/it][AINFO 12-01 13:39:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:39:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:39:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:39:35 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 7/1024 [17:15<42:46:16, 151.40s/it][A
+                                                    [A{'loss': 0.0003, 'grad_norm': 0.0018693821039050817, 'learning_rate': 1e-05, 'num_tokens': 5434801.0, 'completions/mean_length': 6874.9453125, 'completions/min_length': 902.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6568.20166015625, 'completions/min_terminated_length': 902.0, 'completions/max_terminated_length': 16166.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.21778544783592224, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020383886992931366, 'sampling/sampling_logp_difference/max': 14.454351425170898, 'sampling/importance_sampling_ratio/min': 5.279039783090411e-07, 'sampling/importance_sampling_ratio/mean': 0.9999491572380066, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0286128222942352, 'clip_ratio/low_mean': 2.823482634539687e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1211164974156418e-06, 'clip_ratio/high_max': 4.484465989662567e-06, 'clip_ratio/region_mean': 2.9355942729125672e-05, 'epoch': 0.01}
+
+  1%|          | 7/1024 [17:15<42:46:16, 151.40s/it][AINFO 12-01 13:42:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:42:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:42:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:42:14 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 8/1024 [19:46<42:42:17, 151.32s/it][A
+                                                    [A{'loss': 0.0602, 'grad_norm': 0.005809026304632425, 'learning_rate': 1e-05, 'num_tokens': 6128708.0, 'completions/mean_length': 5270.5234375, 'completions/min_length': 239.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5094.119140625, 'completions/min_terminated_length': 239.0, 'completions/max_terminated_length': 15910.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.37320882081985474, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02038305625319481, 'sampling/sampling_logp_difference/max': 11.694463729858398, 'sampling/importance_sampling_ratio/min': 8.339863597939257e-06, 'sampling/importance_sampling_ratio/mean': 0.9999240636825562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0461085885763168, 'clip_ratio/low_mean': 5.802649661745818e-05, 'clip_ratio/low_min': 6.0229353948670905e-06, 'clip_ratio/high_mean': 7.537758676789963e-06, 'clip_ratio/high_max': 2.5703585606606794e-05, 'clip_ratio/region_mean': 6.556425523740472e-05, 'epoch': 0.01}
+
+  1%|          | 8/1024 [19:46<42:42:17, 151.32s/it][AINFO 12-01 13:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:44:45 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 9/1024 [22:01<41:16:53, 146.42s/it][A
+                                                    [A{'loss': 0.1005, 'grad_norm': 0.002717240946367383, 'learning_rate': 1e-05, 'num_tokens': 6726587.0, 'completions/mean_length': 4524.6796875, 'completions/min_length': 60.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4431.29931640625, 'completions/min_terminated_length': 60.0, 'completions/max_terminated_length': 14850.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32325831055641174, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017633724957704544, 'sampling/sampling_logp_difference/max': 8.49998950958252, 'sampling/importance_sampling_ratio/min': 0.0002034705103142187, 'sampling/importance_sampling_ratio/mean': 0.9999507665634155, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8275458142161369, 'clip_ratio/low_mean': 2.037043998370791e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2004183304270555e-05, 'clip_ratio/high_max': 3.965832502217381e-05, 'clip_ratio/region_mean': 3.237462271954428e-05, 'epoch': 0.01}
+
+  1%|          | 9/1024 [22:01<41:16:53, 146.42s/it][AINFO 12-01 13:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:47:01 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 10/1024 [24:56<43:40:56, 155.09s/it][A
+                                                     [A{'loss': 0.0597, 'grad_norm': 0.005626584868878126, 'learning_rate': 1e-05, 'num_tokens': 7400273.0, 'completions/mean_length': 5129.171875, 'completions/min_length': 332.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4950.52392578125, 'completions/min_terminated_length': 332.0, 'completions/max_terminated_length': 15243.0, 'rewards/accuracy_reward/mean': 0.6796875, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.6796875, 'reward_std': 0.379814088344574, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016138140112161636, 'sampling/sampling_logp_difference/max': 5.999964237213135, 'sampling/importance_sampling_ratio/min': 0.002478840760886669, 'sampling/importance_sampling_ratio/mean': 1.0000073909759521, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7103187441825867, 'clip_ratio/low_mean': 3.394487077912345e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7720051346259424e-06, 'clip_ratio/high_max': 7.08802053850377e-06, 'clip_ratio/region_mean': 3.571687602743623e-05, 'epoch': 0.01}
+
+  1%|          | 10/1024 [24:56<43:40:56, 155.09s/it][AINFO 12-01 13:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:49:56 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 11/1024 [27:14<42:12:27, 150.00s/it][A
+                                                     [A{'loss': 0.0285, 'grad_norm': 0.0022279289551079273, 'learning_rate': 1e-05, 'num_tokens': 8026991.0, 'completions/mean_length': 4741.296875, 'completions/min_length': 556.0, 'completions/max_length': 15482.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4741.296875, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 15482.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.1701665222644806, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01844138652086258, 'sampling/sampling_logp_difference/max': 8.999829292297363, 'sampling/importance_sampling_ratio/min': 0.00012343087291810662, 'sampling/importance_sampling_ratio/mean': 0.9999547004699707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.903806746006012, 'clip_ratio/low_mean': 1.9378599517949624e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.9378599517949624e-05, 'epoch': 0.01}
+
+  1%|          | 11/1024 [27:14<42:12:27, 150.00s/it][AINFO 12-01 13:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:52:14 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 12/1024 [29:47<42:24:29, 150.86s/it][A
+                                                     [A{'loss': 0.1298, 'grad_norm': 0.0038395742885768414, 'learning_rate': 1e-05, 'num_tokens': 8797134.0, 'completions/mean_length': 5855.5546875, 'completions/min_length': 453.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5602.8720703125, 'completions/min_terminated_length': 453.0, 'completions/max_terminated_length': 15569.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3503503203392029, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02074582129716873, 'sampling/sampling_logp_difference/max': 7.1811676025390625, 'sampling/importance_sampling_ratio/min': 0.0007607790757901967, 'sampling/importance_sampling_ratio/mean': 0.9999397397041321, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.047883652150631, 'clip_ratio/low_mean': 4.368338659332949e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.93793562175415e-06, 'clip_ratio/high_max': 4.8331594371120445e-06, 'clip_ratio/region_mean': 4.562132153296261e-05, 'epoch': 0.01}
+
+  1%|          | 12/1024 [29:47<42:24:29, 150.86s/it][AINFO 12-01 13:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:54:47 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 13/1024 [32:17<42:18:21, 150.64s/it][A
+                                                     [A{'loss': 0.0197, 'grad_norm': 0.0011414350010454655, 'learning_rate': 1e-05, 'num_tokens': 9691639.0, 'completions/mean_length': 6816.6953125, 'completions/min_length': 99.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6664.83349609375, 'completions/min_terminated_length': 99.0, 'completions/max_terminated_length': 15365.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.1354655921459198, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.021673155948519707, 'sampling/sampling_logp_difference/max': 17.785776138305664, 'sampling/importance_sampling_ratio/min': 1.8868423268258994e-08, 'sampling/importance_sampling_ratio/mean': 0.9998582601547241, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1763990670442581, 'clip_ratio/low_mean': 1.2482652891776524e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.680963562961551e-07, 'clip_ratio/high_max': 3.0723854251846205e-06, 'clip_ratio/region_mean': 1.325074924807268e-05, 'epoch': 0.01}
+
+  1%|▏         | 13/1024 [32:17<42:18:21, 150.64s/it][AINFO 12-01 13:57:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:57:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:57:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:57:17 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 14/1024 [34:40<41:33:03, 148.10s/it][A
+                                                     [A{'loss': 0.0163, 'grad_norm': 0.004565369803458452, 'learning_rate': 1e-05, 'num_tokens': 10391515.0, 'completions/mean_length': 5313.53125, 'completions/min_length': 509.0, 'completions/max_length': 15459.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5313.53125, 'completions/min_terminated_length': 509.0, 'completions/max_terminated_length': 15459.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2648528814315796, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020862173289060593, 'sampling/sampling_logp_difference/max': 10.624350547790527, 'sampling/importance_sampling_ratio/min': 2.431661960144993e-05, 'sampling/importance_sampling_ratio/mean': 0.9998889565467834, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0474217981100082, 'clip_ratio/low_mean': 2.299899915669812e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4562712585284316e-06, 'clip_ratio/high_max': 1.3825085034113727e-05, 'clip_ratio/region_mean': 2.6455270244696294e-05, 'epoch': 0.01}
+
+  1%|▏         | 14/1024 [34:40<41:33:03, 148.10s/it][AINFO 12-01 13:59:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:59:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:59:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:59:39 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 15/1024 [37:05<41:15:46, 147.22s/it][A
+                                                     [A{'loss': 0.077, 'grad_norm': 0.006024828180670738, 'learning_rate': 1e-05, 'num_tokens': 11017781.0, 'completions/mean_length': 4732.578125, 'completions/min_length': 110.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4640.83447265625, 'completions/min_terminated_length': 110.0, 'completions/max_terminated_length': 14724.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.2959064245223999, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020809629932045937, 'sampling/sampling_logp_difference/max': 5.786387920379639, 'sampling/importance_sampling_ratio/min': 0.00306904804892838, 'sampling/importance_sampling_ratio/mean': 0.9999477863311768, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0815455242991447, 'clip_ratio/low_mean': 6.22073393969913e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.914024662459269e-06, 'clip_ratio/high_max': 1.1656098649837077e-05, 'clip_ratio/region_mean': 6.512136405945057e-05, 'epoch': 0.01}
+
+  1%|▏         | 15/1024 [37:05<41:15:46, 147.22s/it][AINFO 12-01 14:02:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:02:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:02:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:02:04 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 16/1024 [39:27<40:50:25, 145.86s/it][A
+                                                     [A{'loss': -0.0001, 'grad_norm': 0.0036383175756782293, 'learning_rate': 1e-05, 'num_tokens': 11794972.0, 'completions/mean_length': 5931.4296875, 'completions/min_length': 59.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5849.1259765625, 'completions/min_terminated_length': 59.0, 'completions/max_terminated_length': 15673.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019794823601841927, 'sampling/sampling_logp_difference/max': 8.1495361328125, 'sampling/importance_sampling_ratio/min': 0.00028886934160254896, 'sampling/importance_sampling_ratio/mean': 0.999954342842102, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0069087892770767, 'clip_ratio/low_mean': 2.816210690070875e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3532825278161908e-06, 'clip_ratio/high_max': 5.413130111264763e-06, 'clip_ratio/region_mean': 2.951538942852494e-05, 'epoch': 0.01}
+
+  2%|▏         | 16/1024 [39:27<40:50:25, 145.86s/it][AINFO 12-01 14:04:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:04:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:04:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:04:27 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 17/1024 [41:50<40:28:49, 144.72s/it][A
+                                                     [A{'loss': 0.0478, 'grad_norm': 0.004312732256948948, 'learning_rate': 1e-05, 'num_tokens': 12517443.0, 'completions/mean_length': 5473.6171875, 'completions/min_length': 171.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5387.70849609375, 'completions/min_terminated_length': 171.0, 'completions/max_terminated_length': 14139.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020093362778425217, 'sampling/sampling_logp_difference/max': 16.095191955566406, 'sampling/importance_sampling_ratio/min': 1.0231680391825648e-07, 'sampling/importance_sampling_ratio/mean': 0.999938428401947, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0765233263373375, 'clip_ratio/low_mean': 3.421858264118782e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.421858264118782e-05, 'epoch': 0.02}
+
+  2%|▏         | 17/1024 [41:50<40:28:49, 144.72s/it][AINFO 12-01 14:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:06:49 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 18/1024 [44:38<42:24:32, 151.76s/it][A
+                                                     [A{'loss': 0.1382, 'grad_norm': 0.0021260723005980253, 'learning_rate': 1e-05, 'num_tokens': 13384420.0, 'completions/mean_length': 6617.7578125, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6137.45068359375, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15754.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01808803342282772, 'sampling/sampling_logp_difference/max': 3.5302083492279053, 'sampling/importance_sampling_ratio/min': 0.02929881028831005, 'sampling/importance_sampling_ratio/mean': 0.999901294708252, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8550976514816284, 'clip_ratio/low_mean': 4.733878370188904e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.218117280492152e-06, 'clip_ratio/high_max': 2.0872469121968606e-05, 'clip_ratio/region_mean': 5.255690120975487e-05, 'epoch': 0.02}
+
+  2%|▏         | 18/1024 [44:38<42:24:32, 151.76s/it][AINFO 12-01 14:09:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:09:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:09:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:09:37 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 19/1024 [47:09<42:18:56, 151.58s/it][A
+                                                     [A{'loss': 0.0001, 'grad_norm': 0.003563448553904891, 'learning_rate': 1e-05, 'num_tokens': 14081197.0, 'completions/mean_length': 5285.7578125, 'completions/min_length': 399.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5109.595703125, 'completions/min_terminated_length': 399.0, 'completions/max_terminated_length': 14382.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.31116873025894165, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017500173300504684, 'sampling/sampling_logp_difference/max': 7.562292098999023, 'sampling/importance_sampling_ratio/min': 0.000519682711455971, 'sampling/importance_sampling_ratio/mean': 0.9998770952224731, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8321448192000389, 'clip_ratio/low_mean': 3.2195434073400975e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6011682646421832e-06, 'clip_ratio/high_max': 6.404673058568733e-06, 'clip_ratio/region_mean': 3.379660131486162e-05, 'epoch': 0.02}
+
+  2%|▏         | 19/1024 [47:09<42:18:56, 151.58s/it][AINFO 12-01 14:12:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:12:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:12:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:12:08 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 20/1024 [49:15<40:08:57, 143.96s/it][A
+                                                     [A{'loss': 0.0265, 'grad_norm': 0.0048850164748728275, 'learning_rate': 1e-05, 'num_tokens': 14727798.0, 'completions/mean_length': 4918.1953125, 'completions/min_length': 494.0, 'completions/max_length': 13991.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4918.1953125, 'completions/min_terminated_length': 494.0, 'completions/max_terminated_length': 13991.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.37716054916381836, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01855182647705078, 'sampling/sampling_logp_difference/max': 7.780951499938965, 'sampling/importance_sampling_ratio/min': 0.00041761461761780083, 'sampling/importance_sampling_ratio/mean': 0.9999402165412903, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9329824000597, 'clip_ratio/low_mean': 5.128390534991922e-05, 'clip_ratio/low_min': 1.2459845038392814e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.128390534991922e-05, 'epoch': 0.02}
+
+  2%|▏         | 20/1024 [49:15<40:08:57, 143.96s/it][AINFO 12-01 14:14:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:14:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:14:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:14:15 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 21/1024 [51:31<39:25:20, 141.50s/it][A
+                                                     [A{'loss': 0.0443, 'grad_norm': 0.0030562332831323147, 'learning_rate': 1e-05, 'num_tokens': 15421937.0, 'completions/mean_length': 5268.5234375, 'completions/min_length': 445.0, 'completions/max_length': 16202.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5268.5234375, 'completions/min_terminated_length': 445.0, 'completions/max_terminated_length': 16202.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021109789609909058, 'sampling/sampling_logp_difference/max': 6.436427593231201, 'sampling/importance_sampling_ratio/min': 0.0016021198825910687, 'sampling/importance_sampling_ratio/mean': 1.0000249147415161, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1676538437604904, 'clip_ratio/low_mean': 3.091395433330035e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5911904231179506e-06, 'clip_ratio/high_max': 1.0364761692471802e-05, 'clip_ratio/region_mean': 3.350514430167095e-05, 'epoch': 0.02}
+
+  2%|▏         | 21/1024 [51:31<39:25:20, 141.50s/it][AINFO 12-01 14:16:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:16:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:16:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:16:30 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 22/1024 [54:02<40:11:23, 144.39s/it][A
+                                                     [A{'loss': 0.0058, 'grad_norm': 0.003777366131544113, 'learning_rate': 1e-05, 'num_tokens': 16128698.0, 'completions/mean_length': 5361.0703125, 'completions/min_length': 177.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5186.103515625, 'completions/min_terminated_length': 177.0, 'completions/max_terminated_length': 15940.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.26409637928009033, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01940997503697872, 'sampling/sampling_logp_difference/max': 5.43656063079834, 'sampling/importance_sampling_ratio/min': 0.004354433622211218, 'sampling/importance_sampling_ratio/mean': 0.9999063611030579, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9569757729768753, 'clip_ratio/low_mean': 3.064284169340681e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.1634053786765435e-06, 'clip_ratio/high_max': 1.6653621514706174e-05, 'clip_ratio/region_mean': 3.480624718577019e-05, 'epoch': 0.02}
+
+  2%|▏         | 22/1024 [54:02<40:11:23, 144.39s/it][AINFO 12-01 14:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:19:02 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 23/1024 [56:56<42:38:31, 153.36s/it][A
+                                                     [A{'loss': 0.0466, 'grad_norm': 0.005749945063143969, 'learning_rate': 1e-05, 'num_tokens': 17101202.0, 'completions/mean_length': 7428.3125, 'completions/min_length': 694.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6910.21435546875, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 15623.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.33114415407180786, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021208524703979492, 'sampling/sampling_logp_difference/max': 4.540204048156738, 'sampling/importance_sampling_ratio/min': 0.010671229101717472, 'sampling/importance_sampling_ratio/mean': 0.9999775886535645, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0387683138251305, 'clip_ratio/low_mean': 4.881033578385541e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.227950727108691e-06, 'clip_ratio/high_max': 3.227977140340954e-05, 'clip_ratio/region_mean': 5.8038286169903586e-05, 'epoch': 0.02}
+
+  2%|▏         | 23/1024 [56:56<42:38:31, 153.36s/it][AINFO 12-01 14:21:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:21:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:21:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:21:56 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 24/1024 [59:32<42:47:22, 154.04s/it][A
+                                                     [A{'loss': -0.0106, 'grad_norm': 0.005364824552088976, 'learning_rate': 1e-05, 'num_tokens': 17820796.0, 'completions/mean_length': 5462.203125, 'completions/min_length': 358.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5288.841796875, 'completions/min_terminated_length': 358.0, 'completions/max_terminated_length': 15659.0, 'rewards/accuracy_reward/mean': 0.1953125, 'rewards/accuracy_reward/std': 0.3979988098144531, 'reward': 0.1953125, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020104583352804184, 'sampling/sampling_logp_difference/max': 6.603336334228516, 'sampling/importance_sampling_ratio/min': 0.0013558369828388095, 'sampling/importance_sampling_ratio/mean': 0.9999232292175293, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.088257022202015, 'clip_ratio/low_mean': 4.543399086287536e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.36028744338546e-07, 'clip_ratio/high_max': 2.544114977354184e-06, 'clip_ratio/region_mean': 4.6070018697719206e-05, 'epoch': 0.02}
+
+  2%|▏         | 24/1024 [59:32<42:47:22, 154.04s/it][AINFO 12-01 14:24:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:24:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:24:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:24:31 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 25/1024 [1:02:15<43:29:17, 156.71s/it][A
+                                                       [A{'loss': 0.0946, 'grad_norm': 0.002392752794548869, 'learning_rate': 1e-05, 'num_tokens': 18538546.0, 'completions/mean_length': 5423.234375, 'completions/min_length': 55.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5249.25439453125, 'completions/min_terminated_length': 55.0, 'completions/max_terminated_length': 16211.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02011517994105816, 'sampling/sampling_logp_difference/max': 13.310263633728027, 'sampling/importance_sampling_ratio/min': 1.657394705034676e-06, 'sampling/importance_sampling_ratio/mean': 0.9999918341636658, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9123491793870926, 'clip_ratio/low_mean': 3.686837260374887e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8309013941907324e-06, 'clip_ratio/high_max': 1.132360557676293e-05, 'clip_ratio/region_mean': 3.96992739979396e-05, 'epoch': 0.02}
+
+  2%|▏         | 25/1024 [1:02:15<43:29:17, 156.71s/it][AINFO 12-01 14:27:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:27:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:27:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:27:14 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 26/1024 [1:04:28<41:31:33, 149.79s/it][A
+                                                       [A{'loss': 0.0036, 'grad_norm': 0.0036540210712701082, 'learning_rate': 1e-05, 'num_tokens': 19270439.0, 'completions/mean_length': 5574.1640625, 'completions/min_length': 318.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5489.04736328125, 'completions/min_terminated_length': 318.0, 'completions/max_terminated_length': 13978.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.23646268248558044, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02006707340478897, 'sampling/sampling_logp_difference/max': 8.425573348999023, 'sampling/importance_sampling_ratio/min': 0.00021918962011113763, 'sampling/importance_sampling_ratio/mean': 0.9999067783355713, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0090710371732712, 'clip_ratio/low_mean': 1.4927492088645522e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.648429234701325e-06, 'clip_ratio/high_max': 2.127026391463005e-05, 'clip_ratio/region_mean': 2.157592166440736e-05, 'epoch': 0.02}
+
+  3%|▎         | 26/1024 [1:04:28<41:31:33, 149.79s/it][AINFO 12-01 14:29:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:29:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:29:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:29:28 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 27/1024 [1:07:02<41:47:45, 150.92s/it][A
+                                                       [A{'loss': 0.0355, 'grad_norm': 0.006005869247019291, 'learning_rate': 1e-05, 'num_tokens': 20083655.0, 'completions/mean_length': 6204.75, 'completions/min_length': 235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6124.5986328125, 'completions/min_terminated_length': 235.0, 'completions/max_terminated_length': 15061.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.31616854667663574, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0189923457801342, 'sampling/sampling_logp_difference/max': 18.249685287475586, 'sampling/importance_sampling_ratio/min': 1.1864853988186042e-08, 'sampling/importance_sampling_ratio/mean': 0.9999549388885498, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.957111045718193, 'clip_ratio/low_mean': 2.827990363130084e-05, 'clip_ratio/low_min': 5.86744272368378e-06, 'clip_ratio/high_mean': 2.8257881012905273e-06, 'clip_ratio/high_max': 1.1303152405162109e-05, 'clip_ratio/region_mean': 3.1105691391530854e-05, 'epoch': 0.02}
+
+  3%|▎         | 27/1024 [1:07:02<41:47:45, 150.92s/it][AINFO 12-01 14:32:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:32:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:32:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:32:02 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 28/1024 [1:09:33<41:47:24, 151.05s/it][A
+                                                       [A{'loss': -0.0023, 'grad_norm': 0.004129618871957064, 'learning_rate': 1e-05, 'num_tokens': 20829064.0, 'completions/mean_length': 5659.5703125, 'completions/min_length': 52.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5489.341796875, 'completions/min_terminated_length': 52.0, 'completions/max_terminated_length': 15274.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3745690584182739, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019232336431741714, 'sampling/sampling_logp_difference/max': 10.124649047851562, 'sampling/importance_sampling_ratio/min': 4.007936149719171e-05, 'sampling/importance_sampling_ratio/mean': 0.9999293088912964, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9482033550739288, 'clip_ratio/low_mean': 3.4569659419503296e-05, 'clip_ratio/low_min': 3.6480373637459707e-06, 'clip_ratio/high_mean': 2.0723034026559617e-06, 'clip_ratio/high_max': 8.289213610623847e-06, 'clip_ratio/region_mean': 3.664196310637635e-05, 'epoch': 0.03}
+
+  3%|▎         | 28/1024 [1:09:33<41:47:24, 151.05s/it][AINFO 12-01 14:34:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:34:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:34:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:34:33 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 29/1024 [1:12:08<42:03:49, 152.19s/it][A
+                                                       [A{'loss': 0.0994, 'grad_norm': 0.004814058542251587, 'learning_rate': 1e-05, 'num_tokens': 21505483.0, 'completions/mean_length': 5122.9609375, 'completions/min_length': 413.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5034.29150390625, 'completions/min_terminated_length': 413.0, 'completions/max_terminated_length': 14558.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.31930169463157654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019156761467456818, 'sampling/sampling_logp_difference/max': 7.87570858001709, 'sampling/importance_sampling_ratio/min': 0.0003798597026616335, 'sampling/importance_sampling_ratio/mean': 0.9999769926071167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.020588956773281, 'clip_ratio/low_mean': 2.9090757720950933e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3835182193797664e-06, 'clip_ratio/high_max': 1.3534072877519066e-05, 'clip_ratio/region_mean': 3.247427605401754e-05, 'epoch': 0.03}
+
+  3%|▎         | 29/1024 [1:12:08<42:03:49, 152.19s/it][AINFO 12-01 14:37:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:37:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:37:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:37:08 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 30/1024 [1:14:28<40:59:08, 148.44s/it][A
+                                                       [A{'loss': 0.0428, 'grad_norm': 0.006946730427443981, 'learning_rate': 1e-05, 'num_tokens': 22142657.0, 'completions/mean_length': 4833.734375, 'completions/min_length': 417.0, 'completions/max_length': 15926.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4833.734375, 'completions/min_terminated_length': 417.0, 'completions/max_terminated_length': 15926.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019841451197862625, 'sampling/sampling_logp_difference/max': 7.701722145080566, 'sampling/importance_sampling_ratio/min': 0.000452048028819263, 'sampling/importance_sampling_ratio/mean': 0.9998937845230103, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0276868790388107, 'clip_ratio/low_mean': 4.13707307416189e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.225991117185913e-06, 'clip_ratio/high_max': 2.0903964468743652e-05, 'clip_ratio/region_mean': 4.659672185880481e-05, 'epoch': 0.03}
+
+  3%|▎         | 30/1024 [1:14:28<40:59:08, 148.44s/it][AINFO 12-01 14:39:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:39:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:39:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:39:28 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 31/1024 [1:17:09<41:59:19, 152.22s/it][A
+                                                       [A{'loss': 0.0397, 'grad_norm': 0.0040768519975245, 'learning_rate': 1e-05, 'num_tokens': 23045931.0, 'completions/mean_length': 6895.390625, 'completions/min_length': 729.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6589.30615234375, 'completions/min_terminated_length': 729.0, 'completions/max_terminated_length': 15122.0, 'rewards/accuracy_reward/mean': 0.1484375, 'rewards/accuracy_reward/std': 0.356930136680603, 'reward': 0.1484375, 'reward_std': 0.20175683498382568, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02265278436243534, 'sampling/sampling_logp_difference/max': 7.406105995178223, 'sampling/importance_sampling_ratio/min': 0.0006075318087823689, 'sampling/importance_sampling_ratio/mean': 0.9999603629112244, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1640124469995499, 'clip_ratio/low_mean': 3.9484380408794095e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1287467006914085e-06, 'clip_ratio/high_max': 8.514986802765634e-06, 'clip_ratio/region_mean': 4.161312688211183e-05, 'epoch': 0.03}
+
+  3%|▎         | 31/1024 [1:17:09<41:59:19, 152.22s/it][AINFO 12-01 14:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:42:09 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 32/1024 [1:19:59<43:25:16, 157.58s/it][A
+                                                       [A{'loss': 0.0853, 'grad_norm': 0.004854958038777113, 'learning_rate': 1e-05, 'num_tokens': 23899259.0, 'completions/mean_length': 6504.0625, 'completions/min_length': 298.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6347.23828125, 'completions/min_terminated_length': 298.0, 'completions/max_terminated_length': 16000.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020535167306661606, 'sampling/sampling_logp_difference/max': 19.505260467529297, 'sampling/importance_sampling_ratio/min': 3.380438373667971e-09, 'sampling/importance_sampling_ratio/mean': 0.999864935874939, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1040372923016548, 'clip_ratio/low_mean': 3.95245172057912e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.95245172057912e-05, 'epoch': 0.03}
+
+  3%|▎         | 32/1024 [1:19:59<43:25:16, 157.58s/it][AINFO 12-01 14:44:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:44:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:44:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:44:59 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 33/1024 [1:22:23<42:17:19, 153.62s/it][A
+                                                       [A{'loss': 0.0258, 'grad_norm': 0.005473555997014046, 'learning_rate': 1e-05, 'num_tokens': 24631956.0, 'completions/mean_length': 5553.2578125, 'completions/min_length': 634.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5467.9765625, 'completions/min_terminated_length': 634.0, 'completions/max_terminated_length': 14787.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29196253418922424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019490022212266922, 'sampling/sampling_logp_difference/max': 5.318830490112305, 'sampling/importance_sampling_ratio/min': 0.004898479674011469, 'sampling/importance_sampling_ratio/mean': 1.0000377893447876, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0357396975159645, 'clip_ratio/low_mean': 4.095688700544997e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.722615699392918e-06, 'clip_ratio/high_max': 1.8890462797571672e-05, 'clip_ratio/region_mean': 4.567950259115605e-05, 'epoch': 0.03}
+
+  3%|▎         | 33/1024 [1:22:23<42:17:19, 153.62s/it][AINFO 12-01 14:47:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:47:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:47:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:47:23 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 34/1024 [1:24:43<41:04:34, 149.37s/it][A
+                                                       [A{'loss': 0.0597, 'grad_norm': 0.003410332603380084, 'learning_rate': 1e-05, 'num_tokens': 25336544.0, 'completions/mean_length': 5357.46875, 'completions/min_length': 4.0, 'completions/max_length': 15753.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5357.46875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15753.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32613158226013184, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01885366439819336, 'sampling/sampling_logp_difference/max': 9.124981880187988, 'sampling/importance_sampling_ratio/min': 0.00010891074634855613, 'sampling/importance_sampling_ratio/mean': 0.9999037384986877, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0332984924316406, 'clip_ratio/low_mean': 2.4400278334724135e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.021345380853745e-06, 'clip_ratio/high_max': 8.08538152341498e-06, 'clip_ratio/region_mean': 2.6421623601891042e-05, 'epoch': 0.03}
+
+  3%|▎         | 34/1024 [1:24:43<41:04:34, 149.37s/it][AINFO 12-01 14:49:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:49:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:49:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:49:42 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 35/1024 [1:27:31<42:35:17, 155.02s/it][A
+                                                       [A{'loss': 0.054, 'grad_norm': 0.003122704103589058, 'learning_rate': 1e-05, 'num_tokens': 26355691.0, 'completions/mean_length': 7812.8984375, 'completions/min_length': 11.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7745.4091796875, 'completions/min_terminated_length': 11.0, 'completions/max_terminated_length': 16210.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.27905434370040894, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022181488573551178, 'sampling/sampling_logp_difference/max': 6.109052658081055, 'sampling/importance_sampling_ratio/min': 0.002222655341029167, 'sampling/importance_sampling_ratio/mean': 0.9999194145202637, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.031004011631012, 'clip_ratio/low_mean': 3.931040214411041e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2372236091759987e-06, 'clip_ratio/high_max': 1.2948894436703995e-05, 'clip_ratio/region_mean': 4.25476254122259e-05, 'epoch': 0.03}
+
+  3%|▎         | 35/1024 [1:27:31<42:35:17, 155.02s/it][AINFO 12-01 14:52:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:52:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:52:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:52:31 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 36/1024 [1:30:29<44:27:39, 162.00s/it][A
+                                                       [A{'loss': 0.0211, 'grad_norm': 0.003292364301159978, 'learning_rate': 1e-05, 'num_tokens': 27193267.0, 'completions/mean_length': 6402.6875, 'completions/min_length': 226.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5825.255859375, 'completions/min_terminated_length': 226.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.30221718549728394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019621271640062332, 'sampling/sampling_logp_difference/max': 14.72463607788086, 'sampling/importance_sampling_ratio/min': 4.0287636693392415e-07, 'sampling/importance_sampling_ratio/mean': 1.0000267028808594, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.96993837505579, 'clip_ratio/low_mean': 3.742906312709238e-05, 'clip_ratio/low_min': 3.3127500955743017e-06, 'clip_ratio/high_mean': 3.2998943879647413e-06, 'clip_ratio/high_max': 1.3199577551858965e-05, 'clip_ratio/region_mean': 4.072895751505712e-05, 'epoch': 0.03}
+
+  4%|▎         | 36/1024 [1:30:29<44:27:39, 162.00s/it][AINFO 12-01 14:55:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:55:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:55:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:55:29 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 37/1024 [1:33:36<46:26:13, 169.38s/it][A
+                                                       [A{'loss': 0.0476, 'grad_norm': 0.004192501772195101, 'learning_rate': 1e-05, 'num_tokens': 28181183.0, 'completions/mean_length': 7525.40625, 'completions/min_length': 701.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7165.30078125, 'completions/min_terminated_length': 701.0, 'completions/max_terminated_length': 14992.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.33797892928123474, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019877666607499123, 'sampling/sampling_logp_difference/max': 11.227011680603027, 'sampling/importance_sampling_ratio/min': 1.3309776477399282e-05, 'sampling/importance_sampling_ratio/mean': 0.9999439716339111, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9819100275635719, 'clip_ratio/low_mean': 4.5394222524919314e-05, 'clip_ratio/low_min': 4.49300887339632e-06, 'clip_ratio/high_mean': 2.27034422550787e-06, 'clip_ratio/high_max': 9.08137690203148e-06, 'clip_ratio/region_mean': 4.766456731886137e-05, 'epoch': 0.03}
+
+  4%|▎         | 37/1024 [1:33:36<46:26:13, 169.38s/it][AINFO 12-01 14:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:58:36 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 38/1024 [1:36:30<46:47:01, 170.81s/it][A
+                                                       [A{'loss': 0.0617, 'grad_norm': 0.0029556062072515488, 'learning_rate': 1e-05, 'num_tokens': 29087384.0, 'completions/mean_length': 6930.8828125, 'completions/min_length': 655.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6625.943359375, 'completions/min_terminated_length': 655.0, 'completions/max_terminated_length': 15572.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2740417718887329, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0197360310703516, 'sampling/sampling_logp_difference/max': 16.87410545349121, 'sampling/importance_sampling_ratio/min': 4.6953626764434375e-08, 'sampling/importance_sampling_ratio/mean': 0.9999445676803589, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9183463454246521, 'clip_ratio/low_mean': 2.638440969349176e-05, 'clip_ratio/low_min': 6.698462129861582e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.638440969349176e-05, 'epoch': 0.03}
+
+  4%|▎         | 38/1024 [1:36:30<46:47:01, 170.81s/it][AINFO 12-01 15:01:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:01:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:01:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:01:30 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 39/1024 [1:38:54<44:30:10, 162.65s/it][A
+                                                       [A{'loss': 0.0042, 'grad_norm': 0.0044409241527318954, 'learning_rate': 1e-05, 'num_tokens': 29860767.0, 'completions/mean_length': 5893.1796875, 'completions/min_length': 466.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5726.6591796875, 'completions/min_terminated_length': 466.0, 'completions/max_terminated_length': 13891.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.3435155153274536, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02058839052915573, 'sampling/sampling_logp_difference/max': 15.3101806640625, 'sampling/importance_sampling_ratio/min': 2.243226049358782e-07, 'sampling/importance_sampling_ratio/mean': 0.9999690055847168, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.05657509714365, 'clip_ratio/low_mean': 3.344960384765727e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.052559569980076e-06, 'clip_ratio/high_max': 2.4210238279920304e-05, 'clip_ratio/region_mean': 3.9502163645011024e-05, 'epoch': 0.04}
+
+  4%|▍         | 39/1024 [1:38:54<44:30:10, 162.65s/it][AINFO 12-01 15:03:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:03:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:03:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:03:53 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 40/1024 [1:40:58<41:16:22, 151.00s/it][A
+                                                       [A{'loss': -0.0198, 'grad_norm': 0.004552105907350779, 'learning_rate': 1e-05, 'num_tokens': 30620388.0, 'completions/mean_length': 5779.4765625, 'completions/min_length': 397.0, 'completions/max_length': 13471.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5779.4765625, 'completions/min_terminated_length': 397.0, 'completions/max_terminated_length': 13471.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.3295513987541199, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020457806065678596, 'sampling/sampling_logp_difference/max': 4.437069416046143, 'sampling/importance_sampling_ratio/min': 0.011830558069050312, 'sampling/importance_sampling_ratio/mean': 0.9999610185623169, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0302623957395554, 'clip_ratio/low_mean': 3.4493159887460934e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.4493159887460934e-05, 'epoch': 0.04}
+
+  4%|▍         | 40/1024 [1:40:58<41:16:22, 151.00s/it][AINFO 12-01 15:05:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:05:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:05:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:05:57 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 41/1024 [1:43:15<40:05:23, 146.82s/it][A
+                                                       [A{'loss': 0.1138, 'grad_norm': 0.003300054930150509, 'learning_rate': 1e-05, 'num_tokens': 31334221.0, 'completions/mean_length': 5429.1328125, 'completions/min_length': 633.0, 'completions/max_length': 13927.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5429.1328125, 'completions/min_terminated_length': 633.0, 'completions/max_terminated_length': 13927.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.23592591285705566, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01882476732134819, 'sampling/sampling_logp_difference/max': 8.623812675476074, 'sampling/importance_sampling_ratio/min': 0.00017977353127207607, 'sampling/importance_sampling_ratio/mean': 0.9999089241027832, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9245247691869736, 'clip_ratio/low_mean': 3.615360617459373e-05, 'clip_ratio/low_min': 4.283315774955554e-06, 'clip_ratio/high_mean': 2.317561666131951e-06, 'clip_ratio/high_max': 9.270246664527804e-06, 'clip_ratio/region_mean': 3.8471167840725684e-05, 'epoch': 0.04}
+
+  4%|▍         | 41/1024 [1:43:15<40:05:23, 146.82s/it][AINFO 12-01 15:08:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:08:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:08:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:08:14 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 42/1024 [1:45:48<40:34:00, 148.72s/it][A
+                                                       [A{'loss': 0.0055, 'grad_norm': 0.004679495934396982, 'learning_rate': 1e-05, 'num_tokens': 32134854.0, 'completions/mean_length': 6073.6328125, 'completions/min_length': 726.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5909.9765625, 'completions/min_terminated_length': 726.0, 'completions/max_terminated_length': 14875.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02106339856982231, 'sampling/sampling_logp_difference/max': 7.393631458282471, 'sampling/importance_sampling_ratio/min': 0.0006151580018922687, 'sampling/importance_sampling_ratio/mean': 0.999961793422699, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0127769336104393, 'clip_ratio/low_mean': 3.780993347390904e-05, 'clip_ratio/low_min': 3.7437480386870448e-06, 'clip_ratio/high_mean': 3.760628430882207e-06, 'clip_ratio/high_max': 1.5042513723528828e-05, 'clip_ratio/region_mean': 4.157056224585176e-05, 'epoch': 0.04}
+
+  4%|▍         | 42/1024 [1:45:48<40:34:00, 148.72s/it][AINFO 12-01 15:10:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:10:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:10:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:10:47 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 43/1024 [1:47:58<39:01:27, 143.21s/it][A
+                                                       [A{'loss': 0.0859, 'grad_norm': 0.00485749589279294, 'learning_rate': 1e-05, 'num_tokens': 32897040.0, 'completions/mean_length': 5773.015625, 'completions/min_length': 4.0, 'completions/max_length': 15052.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5773.015625, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15052.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3595343232154846, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01966444030404091, 'sampling/sampling_logp_difference/max': 12.233952522277832, 'sampling/importance_sampling_ratio/min': 4.862526111537591e-06, 'sampling/importance_sampling_ratio/mean': 1.0000348091125488, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0045431107282639, 'clip_ratio/low_mean': 5.7316304378218774e-05, 'clip_ratio/low_min': 1.412125402566744e-05, 'clip_ratio/high_mean': 4.576835863190354e-06, 'clip_ratio/high_max': 1.8307343452761415e-05, 'clip_ratio/region_mean': 6.189314035509597e-05, 'epoch': 0.04}
+
+  4%|▍         | 43/1024 [1:47:58<39:01:27, 143.21s/it][AINFO 12-01 15:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:12:58 [block_pool.py:292] Successfully reset prefix cache
+Exception ignored in: <function WeakSet.__init__.<locals>._remove at 0x7f84695cce00>
+Traceback (most recent call last):
+  File "/root/miniconda3/lib/python3.11/_weakrefset.py", line 39, in _remove
+    def _remove(item, selfref=ref(self)):
+
+  File "/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/math_verify/utils.py", line 56, in handler
+    raise TimeoutException("Operation timed out!")
+math_verify.errors.TimeoutException: Operation timed out!
+
+  4%|▍         | 44/1024 [1:50:26<39:20:17, 144.51s/it][A
+                                                       [A{'loss': 0.0589, 'grad_norm': 0.004726089537143707, 'learning_rate': 1e-05, 'num_tokens': 33522133.0, 'completions/mean_length': 4731.3515625, 'completions/min_length': 369.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4639.5986328125, 'completions/min_terminated_length': 369.0, 'completions/max_terminated_length': 14572.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019276604056358337, 'sampling/sampling_logp_difference/max': 8.773368835449219, 'sampling/importance_sampling_ratio/min': 0.0001548011932754889, 'sampling/importance_sampling_ratio/mean': 0.9999152421951294, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0001292675733566, 'clip_ratio/low_mean': 1.772546147549292e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.94652681734442e-06, 'clip_ratio/high_max': 1.578610726937768e-05, 'clip_ratio/region_mean': 2.1671988179150503e-05, 'epoch': 0.04}
+
+  4%|▍         | 44/1024 [1:50:26<39:20:17, 144.51s/it][AINFO 12-01 15:15:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:15:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:15:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:15:25 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 45/1024 [1:52:58<39:55:33, 146.82s/it][A
+                                                       [A{'loss': 0.0202, 'grad_norm': 0.0011808272683992982, 'learning_rate': 1e-05, 'num_tokens': 34429384.0, 'completions/mean_length': 6908.8984375, 'completions/min_length': 631.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6834.29150390625, 'completions/min_terminated_length': 631.0, 'completions/max_terminated_length': 15661.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021076666191220284, 'sampling/sampling_logp_difference/max': 7.173947334289551, 'sampling/importance_sampling_ratio/min': 0.0007662919815629721, 'sampling/importance_sampling_ratio/mean': 0.9999626278877258, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0723063945770264, 'clip_ratio/low_mean': 8.259907644969644e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9861447526636766e-06, 'clip_ratio/high_max': 7.944579010654707e-06, 'clip_ratio/region_mean': 1.024605239763332e-05, 'epoch': 0.04}
+
+  4%|▍         | 45/1024 [1:52:58<39:55:33, 146.82s/it][AINFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 46/1024 [1:55:55<42:23:12, 156.03s/it][A
+                                                       [A{'loss': 0.0433, 'grad_norm': 0.003600373398512602, 'learning_rate': 1e-05, 'num_tokens': 35302474.0, 'completions/mean_length': 6679.140625, 'completions/min_length': 828.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6446.22412109375, 'completions/min_terminated_length': 828.0, 'completions/max_terminated_length': 16348.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019808633252978325, 'sampling/sampling_logp_difference/max': 9.312483787536621, 'sampling/importance_sampling_ratio/min': 9.02900064829737e-05, 'sampling/importance_sampling_ratio/mean': 0.9998806715011597, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9413202852010727, 'clip_ratio/low_mean': 2.6357692036071967e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.222089392489579e-06, 'clip_ratio/high_max': 8.888357569958316e-06, 'clip_ratio/region_mean': 2.8579780860127357e-05, 'epoch': 0.04}
+
+  4%|▍         | 46/1024 [1:55:55<42:23:12, 156.03s/it][AINFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 47/1024 [1:58:45<43:25:00, 159.98s/it][A
+                                                       [A{'loss': -0.0024, 'grad_norm': 0.003302425378933549, 'learning_rate': 1e-05, 'num_tokens': 36093941.0, 'completions/mean_length': 5954.5859375, 'completions/min_length': 95.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5872.46435546875, 'completions/min_terminated_length': 95.0, 'completions/max_terminated_length': 16253.0, 'rewards/accuracy_reward/mean': 0.1640625, 'rewards/accuracy_reward/std': 0.371787428855896, 'reward': 0.1640625, 'reward_std': 0.1990984082221985, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022528307512402534, 'sampling/sampling_logp_difference/max': 5.921712875366211, 'sampling/importance_sampling_ratio/min': 0.0026806045789271593, 'sampling/importance_sampling_ratio/mean': 0.9998957514762878, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.200403742492199, 'clip_ratio/low_mean': 1.6833528775350715e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3411616894009057e-06, 'clip_ratio/high_max': 9.364646757603623e-06, 'clip_ratio/region_mean': 1.9174690351064783e-05, 'epoch': 0.04}
+
+  5%|▍         | 47/1024 [1:58:45<43:25:00, 159.98s/it][AINFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 48/1024 [2:01:27<43:34:47, 160.75s/it][A
+                                                       [A{'loss': 0.0979, 'grad_norm': 0.005992463324218988, 'learning_rate': 1e-05, 'num_tokens': 36893486.0, 'completions/mean_length': 6109.1953125, 'completions/min_length': 656.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5946.103515625, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 15867.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.40373340249061584, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018979201093316078, 'sampling/sampling_logp_difference/max': 10.624975204467773, 'sampling/importance_sampling_ratio/min': 2.4301432858919725e-05, 'sampling/importance_sampling_ratio/mean': 0.9999576807022095, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9069097489118576, 'clip_ratio/low_mean': 4.7836430894676596e-05, 'clip_ratio/low_min': 6.161485543998424e-06, 'clip_ratio/high_mean': 3.944288664570195e-06, 'clip_ratio/high_max': 1.2503618108894443e-05, 'clip_ratio/region_mean': 5.1780719331873115e-05, 'epoch': 0.04}
+
+  5%|▍         | 48/1024 [2:01:27<43:34:47, 160.75s/it][AINFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 49/1024 [2:04:09<43:36:24, 161.01s/it][A
+                                                       [A{'loss': 0.1217, 'grad_norm': 0.005304713733494282, 'learning_rate': 1e-05, 'num_tokens': 37716027.0, 'completions/mean_length': 6265.5390625, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6022.6962890625, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15331.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29272884130477905, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019227473065257072, 'sampling/sampling_logp_difference/max': 7.968747615814209, 'sampling/importance_sampling_ratio/min': 0.0003461121814325452, 'sampling/importance_sampling_ratio/mean': 0.9998800754547119, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9107594564557076, 'clip_ratio/low_mean': 2.73638818271138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.768986860246514e-06, 'clip_ratio/high_max': 1.1075947440986056e-05, 'clip_ratio/region_mean': 3.013286891473399e-05, 'epoch': 0.05}
+
+  5%|▍         | 49/1024 [2:04:09<43:36:24, 161.01s/it][AINFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 50/1024 [2:06:53<43:49:28, 161.98s/it][A
+                                                       [A{'loss': 0.0401, 'grad_norm': 0.0017410843865945935, 'learning_rate': 1e-05, 'num_tokens': 38519738.0, 'completions/mean_length': 6143.1796875, 'completions/min_length': 170.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5897.400390625, 'completions/min_terminated_length': 170.0, 'completions/max_terminated_length': 15860.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2301519215106964, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019512062892317772, 'sampling/sampling_logp_difference/max': 5.612663269042969, 'sampling/importance_sampling_ratio/min': 0.0036513316445052624, 'sampling/importance_sampling_ratio/mean': 0.9998773336410522, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9168931543827057, 'clip_ratio/low_mean': 3.135283236588293e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.674950448839809e-06, 'clip_ratio/high_max': 1.0917767667706357e-05, 'clip_ratio/region_mean': 3.50277827010359e-05, 'epoch': 0.05}
+
+  5%|▍         | 50/1024 [2:06:53<43:49:28, 161.98s/it][AINFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 51/1024 [2:09:34<43:42:41, 161.73s/it][A
+                                                       [A{'loss': 0.0544, 'grad_norm': 0.004612576216459274, 'learning_rate': 1e-05, 'num_tokens': 39461012.0, 'completions/mean_length': 7165.265625, 'completions/min_length': 713.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7092.67724609375, 'completions/min_terminated_length': 713.0, 'completions/max_terminated_length': 15616.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.35505855083465576, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0201116893440485, 'sampling/sampling_logp_difference/max': 9.999534606933594, 'sampling/importance_sampling_ratio/min': 4.5421067625284195e-05, 'sampling/importance_sampling_ratio/mean': 1.0000245571136475, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9690218195319176, 'clip_ratio/low_mean': 2.6178069106208568e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7445629459398333e-06, 'clip_ratio/high_max': 5.4981305765977595e-06, 'clip_ratio/region_mean': 2.99226320521484e-05, 'epoch': 0.05}
+
+  5%|▍         | 51/1024 [2:09:34<43:42:41, 161.73s/it][AINFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 52/1024 [2:12:09<43:06:30, 159.66s/it][A
+                                                       [A{'loss': -0.0235, 'grad_norm': 0.003172830445691943, 'learning_rate': 1e-05, 'num_tokens': 40202979.0, 'completions/mean_length': 5617.9296875, 'completions/min_length': 162.0, 'completions/max_length': 16007.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5617.9296875, 'completions/min_terminated_length': 162.0, 'completions/max_terminated_length': 16007.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020904643461108208, 'sampling/sampling_logp_difference/max': 13.609129905700684, 'sampling/importance_sampling_ratio/min': 1.229221084031451e-06, 'sampling/importance_sampling_ratio/mean': 0.9999560117721558, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0479632839560509, 'clip_ratio/low_mean': 2.1866131419301382e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2383335906160937e-06, 'clip_ratio/high_max': 1.2953334362464375e-05, 'clip_ratio/region_mean': 2.5104465066760895e-05, 'epoch': 0.05}
+
+  5%|▌         | 52/1024 [2:12:09<43:06:30, 159.66s/it][AINFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 53/1024 [2:14:34<41:51:45, 155.21s/it][A
+                                                       [A{'loss': 0.0336, 'grad_norm': 0.003333345288410783, 'learning_rate': 1e-05, 'num_tokens': 40989532.0, 'completions/mean_length': 5995.3203125, 'completions/min_length': 397.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5913.51953125, 'completions/min_terminated_length': 397.0, 'completions/max_terminated_length': 16094.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021745413541793823, 'sampling/sampling_logp_difference/max': 9.405362129211426, 'sampling/importance_sampling_ratio/min': 8.228168007917702e-05, 'sampling/importance_sampling_ratio/mean': 0.9999282360076904, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.022934041917324, 'clip_ratio/low_mean': 4.556761541607557e-05, 'clip_ratio/low_min': 8.631802302261349e-06, 'clip_ratio/high_mean': 4.841006557398941e-06, 'clip_ratio/high_max': 1.4129082956060302e-05, 'clip_ratio/region_mean': 5.040862197347451e-05, 'epoch': 0.05}
+
+  5%|▌         | 53/1024 [2:14:34<41:51:45, 155.21s/it][AINFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 54/1024 [2:17:22<42:53:37, 159.19s/it][A
+                                                       [A{'loss': 0.0799, 'grad_norm': 0.005538261961191893, 'learning_rate': 1e-05, 'num_tokens': 41813914.0, 'completions/mean_length': 6297.859375, 'completions/min_length': 1243.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6055.79248046875, 'completions/min_terminated_length': 1243.0, 'completions/max_terminated_length': 15648.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019708994776010513, 'sampling/sampling_logp_difference/max': 8.659659385681152, 'sampling/importance_sampling_ratio/min': 0.00017344337538816035, 'sampling/importance_sampling_ratio/mean': 0.9999532699584961, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9511058703064919, 'clip_ratio/low_mean': 3.960530659696815e-05, 'clip_ratio/low_min': 3.4269107800355414e-06, 'clip_ratio/high_mean': 6.531613848892448e-06, 'clip_ratio/high_max': 2.286436574649997e-05, 'clip_ratio/region_mean': 4.6136920445860596e-05, 'epoch': 0.05}
+
+  5%|▌         | 54/1024 [2:17:22<42:53:37, 159.19s/it][AINFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 55/1024 [2:19:26<40:01:09, 148.68s/it][A
+                                                       [A{'loss': -0.0177, 'grad_norm': 0.0024318129289895296, 'learning_rate': 1e-05, 'num_tokens': 42443288.0, 'completions/mean_length': 4765.046875, 'completions/min_length': 401.0, 'completions/max_length': 14051.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4765.046875, 'completions/min_terminated_length': 401.0, 'completions/max_terminated_length': 14051.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29196253418922424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01920286938548088, 'sampling/sampling_logp_difference/max': 9.175529479980469, 'sampling/importance_sampling_ratio/min': 0.0001035423920257017, 'sampling/importance_sampling_ratio/mean': 0.9999518394470215, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9130316227674484, 'clip_ratio/low_mean': 2.561447990956367e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.143934355241072e-06, 'clip_ratio/high_max': 4.575737420964288e-06, 'clip_ratio/region_mean': 2.6758414151117904e-05, 'epoch': 0.05}
+
+  5%|▌         | 55/1024 [2:19:26<40:01:09, 148.68s/it][AINFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 56/1024 [2:22:27<42:32:06, 158.19s/it][A
+                                                       [A{'loss': -0.0036, 'grad_norm': 0.0018957280553877354, 'learning_rate': 1e-05, 'num_tokens': 43287600.0, 'completions/mean_length': 6411.5, 'completions/min_length': 321.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5834.578125, 'completions/min_terminated_length': 321.0, 'completions/max_terminated_length': 15445.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.1990983933210373, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018554572016000748, 'sampling/sampling_logp_difference/max': 6.124218463897705, 'sampling/importance_sampling_ratio/min': 0.0021892013028264046, 'sampling/importance_sampling_ratio/mean': 0.9999212622642517, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8110766112804413, 'clip_ratio/low_mean': 4.221943618176738e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.771039933373686e-06, 'clip_ratio/high_max': 7.084159733494744e-06, 'clip_ratio/region_mean': 4.3990476115141064e-05, 'epoch': 0.05}
+
+  5%|▌         | 56/1024 [2:22:27<42:32:06, 158.19s/it][AINFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 57/1024 [2:25:02<42:14:27, 157.26s/it][A
+                                                       [A{'loss': 0.0274, 'grad_norm': 0.002431448083370924, 'learning_rate': 1e-05, 'num_tokens': 44145524.0, 'completions/mean_length': 6552.40625, 'completions/min_length': 348.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6235.2578125, 'completions/min_terminated_length': 348.0, 'completions/max_terminated_length': 15508.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.3114011883735657, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020403606817126274, 'sampling/sampling_logp_difference/max': 2.974147081375122, 'sampling/importance_sampling_ratio/min': 0.051090992987155914, 'sampling/importance_sampling_ratio/mean': 0.999876081943512, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0034996420145035, 'clip_ratio/low_mean': 4.334260950145108e-05, 'clip_ratio/low_min': 8.570448699174449e-06, 'clip_ratio/high_mean': 1.6897372461244231e-06, 'clip_ratio/high_max': 6.7589489844976924e-06, 'clip_ratio/region_mean': 4.503234697494918e-05, 'epoch': 0.05}
+
+  6%|▌         | 57/1024 [2:25:02<42:14:27, 157.26s/it][AINFO 12-01 15:50:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:50:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:50:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:50:01 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 58/1024 [2:27:19<40:33:14, 151.13s/it][A
+                                                       [A{'loss': 0.0344, 'grad_norm': 0.004493447951972485, 'learning_rate': 1e-05, 'num_tokens': 44763895.0, 'completions/mean_length': 4688.7734375, 'completions/min_length': 345.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4408.08837890625, 'completions/min_terminated_length': 345.0, 'completions/max_terminated_length': 13257.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.26196980476379395, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01916680857539177, 'sampling/sampling_logp_difference/max': 10.364669799804688, 'sampling/importance_sampling_ratio/min': 3.1526888051303104e-05, 'sampling/importance_sampling_ratio/mean': 0.9999460577964783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9620971381664276, 'clip_ratio/low_mean': 1.0045687076853937e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.422987098630983e-06, 'clip_ratio/high_max': 2.1032463337178342e-05, 'clip_ratio/region_mean': 1.646867417548492e-05, 'epoch': 0.05}
+
+  6%|▌         | 58/1024 [2:27:19<40:33:14, 151.13s/it][AINFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 59/1024 [2:29:44<40:01:00, 149.29s/it][A
+                                                       [A{'loss': 0.0813, 'grad_norm': 0.0049595762975513935, 'learning_rate': 1e-05, 'num_tokens': 45470335.0, 'completions/mean_length': 5381.1875, 'completions/min_length': 25.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5294.55126953125, 'completions/min_terminated_length': 25.0, 'completions/max_terminated_length': 14591.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3090519607067108, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020656142383813858, 'sampling/sampling_logp_difference/max': 15.624994277954102, 'sampling/importance_sampling_ratio/min': 1.6373864752949885e-07, 'sampling/importance_sampling_ratio/mean': 0.9998573660850525, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0265433564782143, 'clip_ratio/low_mean': 2.8500278403953416e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.691486530347902e-06, 'clip_ratio/high_max': 3.076594612139161e-05, 'clip_ratio/region_mean': 3.619176493430132e-05, 'epoch': 0.05}
+
+  6%|▌         | 59/1024 [2:29:44<40:01:00, 149.29s/it][AINFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 60/1024 [2:32:13<39:57:14, 149.21s/it][A
+                                                       [A{'loss': 0.068, 'grad_norm': 0.00655899103730917, 'learning_rate': 1e-05, 'num_tokens': 46206971.0, 'completions/mean_length': 5613.84375, 'completions/min_length': 55.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5529.03955078125, 'completions/min_terminated_length': 55.0, 'completions/max_terminated_length': 15006.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3090519607067108, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020495962351560593, 'sampling/sampling_logp_difference/max': 3.4162673950195312, 'sampling/importance_sampling_ratio/min': 0.03283476456999779, 'sampling/importance_sampling_ratio/mean': 0.999952495098114, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0289503335952759, 'clip_ratio/low_mean': 3.143254116366734e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.581610250577796e-06, 'clip_ratio/high_max': 2.6326441002311185e-05, 'clip_ratio/region_mean': 3.8014151868992485e-05, 'epoch': 0.06}
+
+  6%|▌         | 60/1024 [2:32:13<39:57:14, 149.21s/it][AINFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 61/1024 [2:34:47<40:20:55, 150.84s/it][A
+                                                       [A{'loss': 0.0459, 'grad_norm': 0.007459669373929501, 'learning_rate': 1e-05, 'num_tokens': 46940112.0, 'completions/mean_length': 5577.2890625, 'completions/min_length': 784.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5492.19677734375, 'completions/min_terminated_length': 784.0, 'completions/max_terminated_length': 14763.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.39082521200180054, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018994126468896866, 'sampling/sampling_logp_difference/max': 14.014364242553711, 'sampling/importance_sampling_ratio/min': 8.196697649509588e-07, 'sampling/importance_sampling_ratio/mean': 1.0000065565109253, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9836367890238762, 'clip_ratio/low_mean': 3.3687326776998816e-05, 'clip_ratio/low_min': 5.745277576352237e-06, 'clip_ratio/high_mean': 8.083893476396042e-06, 'clip_ratio/high_max': 3.233557390558417e-05, 'clip_ratio/region_mean': 4.1771219912334345e-05, 'epoch': 0.06}
+
+  6%|▌         | 61/1024 [2:34:47<40:20:55, 150.84s/it][AINFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-01 16:01:44,103 - math_verify.grader - WARNING - Timeout during comparison
+
+  6%|▌         | 62/1024 [2:37:45<42:28:16, 158.94s/it][A
+                                                       [A{'loss': -0.0013, 'grad_norm': 0.005132914055138826, 'learning_rate': 1e-05, 'num_tokens': 47796514.0, 'completions/mean_length': 6547.140625, 'completions/min_length': 266.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6311.05615234375, 'completions/min_terminated_length': 266.0, 'completions/max_terminated_length': 16273.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2751026153564453, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02021491341292858, 'sampling/sampling_logp_difference/max': 7.597993850708008, 'sampling/importance_sampling_ratio/min': 0.0005014563794247806, 'sampling/importance_sampling_ratio/mean': 0.999970018863678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9028418883681297, 'clip_ratio/low_mean': 3.032099141364597e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.300606747165148e-06, 'clip_ratio/high_max': 1.720242698866059e-05, 'clip_ratio/region_mean': 3.462159838818479e-05, 'epoch': 0.06}
+
+  6%|▌         | 62/1024 [2:37:45<42:28:16, 158.94s/it][AINFO 12-01 16:02:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:02:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:02:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:02:45 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 63/1024 [2:41:06<45:48:17, 171.59s/it][A
+                                                       [A{'loss': 0.0196, 'grad_norm': 0.0034147046972066164, 'learning_rate': 1e-05, 'num_tokens': 48765386.0, 'completions/mean_length': 7409.3125, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6811.00048828125, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.27198708057403564, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01943383738398552, 'sampling/sampling_logp_difference/max': 12.379810333251953, 'sampling/importance_sampling_ratio/min': 4.202586751489434e-06, 'sampling/importance_sampling_ratio/mean': 0.9998997449874878, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8614663332700729, 'clip_ratio/low_mean': 2.838153790207798e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.695532941743295e-06, 'clip_ratio/high_max': 1.078213176697318e-05, 'clip_ratio/region_mean': 3.1077070843821275e-05, 'epoch': 0.06}
+
+  6%|▌         | 63/1024 [2:41:06<45:48:17, 171.59s/it][AINFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▋         | 64/1024 [2:43:38<44:11:06, 165.69s/it][A
+                                                       [A{'loss': 0.0371, 'grad_norm': 0.004101228900253773, 'learning_rate': 1e-05, 'num_tokens': 49606280.0, 'completions/mean_length': 6420.859375, 'completions/min_length': 273.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6181.744140625, 'completions/min_terminated_length': 273.0, 'completions/max_terminated_length': 14591.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01929381489753723, 'sampling/sampling_logp_difference/max': 8.258644104003906, 'sampling/importance_sampling_ratio/min': 0.000259009946603328, 'sampling/importance_sampling_ratio/mean': 1.0000226497650146, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9671022593975067, 'clip_ratio/low_mean': 3.695166174111364e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8833828764618374e-06, 'clip_ratio/high_max': 1.153353150584735e-05, 'clip_ratio/region_mean': 3.98350443902018e-05, 'epoch': 0.06}
+
+  6%|▋         | 64/1024 [2:43:38<44:11:06, 165.69s/it][AINFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▋         | 65/1024 [2:46:04<42:31:38, 159.64s/it][A
+                                                       [A{'loss': 0.0601, 'grad_norm': 0.0077895247377455235, 'learning_rate': 1e-05, 'num_tokens': 50246457.0, 'completions/mean_length': 4852.7578125, 'completions/min_length': 92.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4761.96044921875, 'completions/min_terminated_length': 92.0, 'completions/max_terminated_length': 14971.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.35400262475013733, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01895500347018242, 'sampling/sampling_logp_difference/max': 10.624988555908203, 'sampling/importance_sampling_ratio/min': 2.4301109078805894e-05, 'sampling/importance_sampling_ratio/mean': 0.9999773502349854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9933939427137375, 'clip_ratio/low_mean': 4.231768923546042e-05, 'clip_ratio/low_min': 5.164009053260088e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.231768923546042e-05, 'epoch': 0.06}
+
+  6%|▋         | 65/1024 [2:46:04<42:31:38, 159.64s/it][AINFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▋         | 66/1024 [2:48:45<42:38:52, 160.26s/it][A
+                                                       [A{'loss': 0.0534, 'grad_norm': 0.00207411777228117, 'learning_rate': 1e-05, 'num_tokens': 51141597.0, 'completions/mean_length': 6840.03125, 'completions/min_length': 728.0, 'completions/max_length': 15610.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6840.03125, 'completions/min_terminated_length': 728.0, 'completions/max_terminated_length': 15610.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2790592312812805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02091015875339508, 'sampling/sampling_logp_difference/max': 15.411253929138184, 'sampling/importance_sampling_ratio/min': 2.0275774659239687e-07, 'sampling/importance_sampling_ratio/mean': 0.9999240636825562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9959733113646507, 'clip_ratio/low_mean': 3.009997408298659e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.009997408298659e-05, 'epoch': 0.06}
+
+  6%|▋         | 66/1024 [2:48:45<42:38:52, 160.26s/it][AINFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 67/1024 [2:51:17<41:53:20, 157.58s/it][A
+                                                       [A{'loss': 0.0238, 'grad_norm': 0.006496666464954615, 'learning_rate': 1e-05, 'num_tokens': 52001758.0, 'completions/mean_length': 6567.3828125, 'completions/min_length': 234.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6331.7841796875, 'completions/min_terminated_length': 234.0, 'completions/max_terminated_length': 15249.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021580250933766365, 'sampling/sampling_logp_difference/max': 5.936847686767578, 'sampling/importance_sampling_ratio/min': 0.0026403397787362337, 'sampling/importance_sampling_ratio/mean': 0.9999523162841797, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0921807065606117, 'clip_ratio/low_mean': 4.6152885829542356e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.853683203189576e-06, 'clip_ratio/high_max': 2.297391938554938e-05, 'clip_ratio/region_mean': 5.3006569942226633e-05, 'epoch': 0.06}
+
+  7%|▋         | 67/1024 [2:51:17<41:53:20, 157.58s/it][AINFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 68/1024 [2:54:24<44:14:08, 166.58s/it][A
+                                                       [A{'loss': 0.021, 'grad_norm': 0.002272722776979208, 'learning_rate': 1e-05, 'num_tokens': 52907256.0, 'completions/mean_length': 6927.265625, 'completions/min_length': 781.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6542.84521484375, 'completions/min_terminated_length': 781.0, 'completions/max_terminated_length': 16336.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.22673700749874115, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01844738982617855, 'sampling/sampling_logp_difference/max': 16.51754379272461, 'sampling/importance_sampling_ratio/min': 6.70690099013882e-08, 'sampling/importance_sampling_ratio/mean': 0.9999938011169434, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8170016556978226, 'clip_ratio/low_mean': 1.7558751551405294e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0726623663213104e-06, 'clip_ratio/high_max': 1.2290649465285242e-05, 'clip_ratio/region_mean': 2.0631413917726604e-05, 'epoch': 0.06}
+
+  7%|▋         | 68/1024 [2:54:24<44:14:08, 166.58s/it][AINFO 12-01 16:19:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:19:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:19:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:19:24 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 69/1024 [2:56:59<43:14:08, 162.98s/it][A
+                                                       [A{'loss': 0.0382, 'grad_norm': 0.005651532672345638, 'learning_rate': 1e-05, 'num_tokens': 53682100.0, 'completions/mean_length': 5889.28125, 'completions/min_length': 260.0, 'completions/max_length': 16228.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5889.28125, 'completions/min_terminated_length': 260.0, 'completions/max_terminated_length': 16228.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.32613158226013184, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020069826394319534, 'sampling/sampling_logp_difference/max': 14.67677116394043, 'sampling/importance_sampling_ratio/min': 4.226289718189946e-07, 'sampling/importance_sampling_ratio/mean': 0.9998855590820312, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0794919431209564, 'clip_ratio/low_mean': 5.522496246612718e-05, 'clip_ratio/low_min': 4.129910394112812e-06, 'clip_ratio/high_mean': 4.526967131823767e-06, 'clip_ratio/high_max': 1.016177520796191e-05, 'clip_ratio/region_mean': 5.9751928688456246e-05, 'epoch': 0.06}
+
+  7%|▋         | 69/1024 [2:56:59<43:14:08, 162.98s/it][AINFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 70/1024 [2:59:38<42:52:52, 161.82s/it][A
+                                                       [A{'loss': 0.0246, 'grad_norm': 0.002985857194289565, 'learning_rate': 1e-05, 'num_tokens': 54456508.0, 'completions/mean_length': 5909.3125, 'completions/min_length': 197.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5394.16357421875, 'completions/min_terminated_length': 197.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01927822455763817, 'sampling/sampling_logp_difference/max': 7.699061393737793, 'sampling/importance_sampling_ratio/min': 0.000453252432635054, 'sampling/importance_sampling_ratio/mean': 0.999995231628418, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8462172821164131, 'clip_ratio/low_mean': 4.575056436806335e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4492417221845244e-06, 'clip_ratio/high_max': 5.796966888738098e-06, 'clip_ratio/region_mean': 4.719980597656104e-05, 'epoch': 0.06}
+
+  7%|▋         | 70/1024 [2:59:38<42:52:52, 161.82s/it][AINFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 71/1024 [3:02:39<44:23:22, 167.68s/it][A
+                                                       [A{'loss': 0.0218, 'grad_norm': 0.0036494233645498753, 'learning_rate': 1e-05, 'num_tokens': 55429663.0, 'completions/mean_length': 7465.3984375, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7177.701171875, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15579.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01909823715686798, 'sampling/sampling_logp_difference/max': 6.343155384063721, 'sampling/importance_sampling_ratio/min': 0.0017587440088391304, 'sampling/importance_sampling_ratio/mean': 0.9998987913131714, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8792542889714241, 'clip_ratio/low_mean': 3.1553636290482245e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.821615673085034e-06, 'clip_ratio/high_max': 1.8927265045931563e-05, 'clip_ratio/region_mean': 3.737525207725412e-05, 'epoch': 0.07}
+
+  7%|▋         | 71/1024 [3:02:39<44:23:22, 167.68s/it][AINFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 72/1024 [3:05:16<43:27:18, 164.33s/it][A
+                                                       [A{'loss': 0.0295, 'grad_norm': 0.003951186314225197, 'learning_rate': 1e-05, 'num_tokens': 56173314.0, 'completions/mean_length': 5674.9609375, 'completions/min_length': 71.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5590.6376953125, 'completions/min_terminated_length': 71.0, 'completions/max_terminated_length': 15670.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.29249149560928345, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01932360976934433, 'sampling/sampling_logp_difference/max': 5.742441177368164, 'sampling/importance_sampling_ratio/min': 0.003206930123269558, 'sampling/importance_sampling_ratio/mean': 0.9999845623970032, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9117730036377907, 'clip_ratio/low_mean': 3.611839565564878e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1965249743516324e-06, 'clip_ratio/high_max': 8.78609989740653e-06, 'clip_ratio/region_mean': 3.831492040262674e-05, 'epoch': 0.07}
+
+  7%|▋         | 72/1024 [3:05:16<43:27:18, 164.33s/it][AINFO 12-01 16:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:30:16 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 73/1024 [3:07:27<40:47:22, 154.41s/it][A
+                                                       [A{'loss': 0.0328, 'grad_norm': 0.005329386796802282, 'learning_rate': 1e-05, 'num_tokens': 56799911.0, 'completions/mean_length': 4754.5390625, 'completions/min_length': 291.0, 'completions/max_length': 16325.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4754.5390625, 'completions/min_terminated_length': 291.0, 'completions/max_terminated_length': 16325.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.4111049771308899, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.01792578026652336, 'sampling/sampling_logp_difference/max': 9.36398696899414, 'sampling/importance_sampling_ratio/min': 8.575750689487904e-05, 'sampling/importance_sampling_ratio/mean': 0.9999337196350098, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8350499644875526, 'clip_ratio/low_mean': 4.657158876852918e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.872955512131739e-06, 'clip_ratio/high_max': 1.7587798083695816e-05, 'clip_ratio/region_mean': 5.244454393960041e-05, 'epoch': 0.07}
+
+  7%|▋         | 73/1024 [3:07:27<40:47:22, 154.41s/it][AINFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 74/1024 [3:10:06<41:04:54, 155.68s/it][A
+                                                       [A{'loss': 0.082, 'grad_norm': 0.0036763548851013184, 'learning_rate': 1e-05, 'num_tokens': 57553986.0, 'completions/mean_length': 5744.2734375, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5488.92041015625, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16316.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018098725005984306, 'sampling/sampling_logp_difference/max': 9.082645416259766, 'sampling/importance_sampling_ratio/min': 0.00011362064105924219, 'sampling/importance_sampling_ratio/mean': 0.9999231696128845, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8065197095274925, 'clip_ratio/low_mean': 1.8536085917730816e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1438435144082177e-06, 'clip_ratio/high_max': 1.2575374057632871e-05, 'clip_ratio/region_mean': 2.1679929204765358e-05, 'epoch': 0.07}
+
+  7%|▋         | 74/1024 [3:10:06<41:04:54, 155.68s/it][AINFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 75/1024 [3:12:46<41:24:05, 157.06s/it][A
+                                                       [A{'loss': 0.0139, 'grad_norm': 0.0038320303428918123, 'learning_rate': 1e-05, 'num_tokens': 58438333.0, 'completions/mean_length': 6754.5234375, 'completions/min_length': 638.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6523.41650390625, 'completions/min_terminated_length': 638.0, 'completions/max_terminated_length': 16088.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2369818389415741, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02173798717558384, 'sampling/sampling_logp_difference/max': 12.989178657531738, 'sampling/importance_sampling_ratio/min': 2.284922175022075e-06, 'sampling/importance_sampling_ratio/mean': 0.9999582767486572, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.013127624988556, 'clip_ratio/low_mean': 2.6290458890798618e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.101248914092139e-06, 'clip_ratio/high_max': 1.877081149359583e-05, 'clip_ratio/region_mean': 3.239170769120392e-05, 'epoch': 0.07}
+
+  7%|▋         | 75/1024 [3:12:46<41:24:05, 157.06s/it][AINFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 76/1024 [3:15:20<41:07:09, 156.15s/it][A
+                                                       [A{'loss': 0.0483, 'grad_norm': 0.004985450301319361, 'learning_rate': 1e-05, 'num_tokens': 59249562.0, 'completions/mean_length': 6203.5390625, 'completions/min_length': 408.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6123.3779296875, 'completions/min_terminated_length': 408.0, 'completions/max_terminated_length': 12421.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019999932497739792, 'sampling/sampling_logp_difference/max': 5.3917694091796875, 'sampling/importance_sampling_ratio/min': 0.004553908482193947, 'sampling/importance_sampling_ratio/mean': 0.9999778270721436, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0302691981196404, 'clip_ratio/low_mean': 3.252214798976638e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.682960474790889e-06, 'clip_ratio/high_max': 1.9026635982299922e-05, 'clip_ratio/region_mean': 3.920510800980992e-05, 'epoch': 0.07}
+
+  7%|▋         | 76/1024 [3:15:20<41:07:09, 156.15s/it][AINFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 77/1024 [3:17:44<40:08:15, 152.58s/it][A
+                                                       [A{'loss': 0.0236, 'grad_norm': 0.0037541294004768133, 'learning_rate': 1e-05, 'num_tokens': 60001208.0, 'completions/mean_length': 5727.796875, 'completions/min_length': 743.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5643.8896484375, 'completions/min_terminated_length': 743.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.20753079652786255, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020555900409817696, 'sampling/sampling_logp_difference/max': 8.400880813598633, 'sampling/importance_sampling_ratio/min': 0.00022466933296527714, 'sampling/importance_sampling_ratio/mean': 0.9999213218688965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9781062752008438, 'clip_ratio/low_mean': 3.63567767180939e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4910855220005033e-06, 'clip_ratio/high_max': 1.3964342088002013e-05, 'clip_ratio/region_mean': 3.984786212640756e-05, 'epoch': 0.07}
+
+  8%|▊         | 77/1024 [3:17:44<40:08:15, 152.58s/it][AINFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 78/1024 [3:20:36<41:33:37, 158.16s/it][A
+                                                       [A{'loss': 0.0754, 'grad_norm': 0.007178841158747673, 'learning_rate': 1e-05, 'num_tokens': 60777899.0, 'completions/mean_length': 5923.8359375, 'completions/min_length': 597.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5409.4013671875, 'completions/min_terminated_length': 597.0, 'completions/max_terminated_length': 15720.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2977364659309387, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019868161529302597, 'sampling/sampling_logp_difference/max': 7.621582508087158, 'sampling/importance_sampling_ratio/min': 0.0004897661856375635, 'sampling/importance_sampling_ratio/mean': 0.9999773502349854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9449758678674698, 'clip_ratio/low_mean': 3.516969627526123e-05, 'clip_ratio/low_min': 4.025116595585132e-06, 'clip_ratio/high_mean': 6.949231874386896e-07, 'clip_ratio/high_max': 2.7796927497547586e-06, 'clip_ratio/region_mean': 3.586461934901308e-05, 'epoch': 0.07}
+
+  8%|▊         | 78/1024 [3:20:36<41:33:37, 158.16s/it][AINFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 79/1024 [3:23:03<40:38:16, 154.81s/it][A
+                                                       [A{'loss': 0.0136, 'grad_norm': 0.004776299465447664, 'learning_rate': 1e-05, 'num_tokens': 61587141.0, 'completions/mean_length': 6171.640625, 'completions/min_length': 721.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5926.54443359375, 'completions/min_terminated_length': 721.0, 'completions/max_terminated_length': 14267.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.36113685369491577, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019484341144561768, 'sampling/sampling_logp_difference/max': 10.124996185302734, 'sampling/importance_sampling_ratio/min': 4.0065449866233394e-05, 'sampling/importance_sampling_ratio/mean': 0.999945878982544, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8597526922821999, 'clip_ratio/low_mean': 4.3257180891487224e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.016423746288638e-06, 'clip_ratio/high_max': 2.7642782697512303e-05, 'clip_ratio/region_mean': 5.227360486514954e-05, 'epoch': 0.07}
+
+  8%|▊         | 79/1024 [3:23:03<40:38:16, 154.81s/it][AINFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 80/1024 [3:25:28<39:52:32, 152.07s/it][A
+                                                       [A{'loss': 0.0539, 'grad_norm': 0.007431659381836653, 'learning_rate': 1e-05, 'num_tokens': 62308321.0, 'completions/mean_length': 5501.59375, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5415.9052734375, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 15310.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.400318443775177, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019636545330286026, 'sampling/sampling_logp_difference/max': 9.999296188354492, 'sampling/importance_sampling_ratio/min': 4.54318942502141e-05, 'sampling/importance_sampling_ratio/mean': 1.0000393390655518, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9705724790692329, 'clip_ratio/low_mean': 3.6077020070024446e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.036492244566034e-05, 'clip_ratio/high_max': 4.145968978264136e-05, 'clip_ratio/region_mean': 4.644194200409402e-05, 'epoch': 0.07}
+
+  8%|▊         | 80/1024 [3:25:28<39:52:32, 152.07s/it][AINFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 81/1024 [3:27:55<39:25:09, 150.49s/it][A
+                                                       [A{'loss': 0.0797, 'grad_norm': 0.005465450696647167, 'learning_rate': 1e-05, 'num_tokens': 63084113.0, 'completions/mean_length': 5908.125, 'completions/min_length': 504.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5825.6376953125, 'completions/min_terminated_length': 504.0, 'completions/max_terminated_length': 15781.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.39400771260261536, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018073562532663345, 'sampling/sampling_logp_difference/max': 9.951221466064453, 'sampling/importance_sampling_ratio/min': 4.766937126987614e-05, 'sampling/importance_sampling_ratio/mean': 0.9999576210975647, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8575867265462875, 'clip_ratio/low_mean': 6.429905033655814e-05, 'clip_ratio/low_min': 6.3626184783061035e-06, 'clip_ratio/high_mean': 1.081801542568428e-06, 'clip_ratio/high_max': 4.327206170273712e-06, 'clip_ratio/region_mean': 6.538085153806605e-05, 'epoch': 0.07}
+
+  8%|▊         | 81/1024 [3:27:55<39:25:09, 150.49s/it][AINFO 12-01 16:52:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:52:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:52:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:52:55 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 82/1024 [3:30:19<38:49:45, 148.39s/it][A
+                                                       [A{'loss': -0.0104, 'grad_norm': 0.003077819012105465, 'learning_rate': 1e-05, 'num_tokens': 63740015.0, 'completions/mean_length': 4906.734375, 'completions/min_length': 108.0, 'completions/max_length': 15981.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4906.734375, 'completions/min_terminated_length': 108.0, 'completions/max_terminated_length': 15981.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2251344621181488, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01949312724173069, 'sampling/sampling_logp_difference/max': 9.879111289978027, 'sampling/importance_sampling_ratio/min': 5.1233790145488456e-05, 'sampling/importance_sampling_ratio/mean': 1.000091791152954, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9647495672106743, 'clip_ratio/low_mean': 3.040744320514932e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6878207134141121e-06, 'clip_ratio/high_max': 6.7512828536564484e-06, 'clip_ratio/region_mean': 3.209526391856343e-05, 'epoch': 0.08}
+
+  8%|▊         | 82/1024 [3:30:19<38:49:45, 148.39s/it][AINFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 83/1024 [3:32:31<37:31:47, 143.58s/it][A
+                                                       [A{'loss': 0.0364, 'grad_norm': 0.0018245981773361564, 'learning_rate': 1e-05, 'num_tokens': 64450515.0, 'completions/mean_length': 5402.78125, 'completions/min_length': 277.0, 'completions/max_length': 15716.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5402.78125, 'completions/min_terminated_length': 277.0, 'completions/max_terminated_length': 15716.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019615523517131805, 'sampling/sampling_logp_difference/max': 6.93695592880249, 'sampling/importance_sampling_ratio/min': 0.0009712215978652239, 'sampling/importance_sampling_ratio/mean': 0.9999257922172546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9809223562479019, 'clip_ratio/low_mean': 3.626802561029763e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8155938050767872e-06, 'clip_ratio/high_max': 7.262375220307149e-06, 'clip_ratio/region_mean': 3.8083618960627064e-05, 'epoch': 0.08}
+
+  8%|▊         | 83/1024 [3:32:31<37:31:47, 143.58s/it][AINFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 84/1024 [3:35:26<39:57:20, 153.02s/it][A
+                                                       [A{'loss': 0.0645, 'grad_norm': 0.006053395569324493, 'learning_rate': 1e-05, 'num_tokens': 65269285.0, 'completions/mean_length': 6198.703125, 'completions/min_length': 265.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5870.14501953125, 'completions/min_terminated_length': 265.0, 'completions/max_terminated_length': 16329.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3464113473892212, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01869945600628853, 'sampling/sampling_logp_difference/max': 6.874996662139893, 'sampling/importance_sampling_ratio/min': 0.0010333011159673333, 'sampling/importance_sampling_ratio/mean': 0.9999875426292419, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8571672514081001, 'clip_ratio/low_mean': 4.734331901090627e-05, 'clip_ratio/low_min': 1.1585900665522786e-05, 'clip_ratio/high_mean': 2.9435553301482287e-06, 'clip_ratio/high_max': 1.1774221320592915e-05, 'clip_ratio/region_mean': 5.0286874625271594e-05, 'epoch': 0.08}
+
+  8%|▊         | 84/1024 [3:35:26<39:57:20, 153.02s/it][AINFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 85/1024 [3:38:03<40:12:00, 154.12s/it][A
+                                                       [A{'loss': 0.0681, 'grad_norm': 0.0030623299535363913, 'learning_rate': 1e-05, 'num_tokens': 66058473.0, 'completions/mean_length': 6016.09375, 'completions/min_length': 370.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5851.52392578125, 'completions/min_terminated_length': 370.0, 'completions/max_terminated_length': 15972.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.24883407354354858, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02085939608514309, 'sampling/sampling_logp_difference/max': 6.4199748039245605, 'sampling/importance_sampling_ratio/min': 0.0016286972677335143, 'sampling/importance_sampling_ratio/mean': 0.9999305009841919, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9883866459131241, 'clip_ratio/low_mean': 3.2358174394175876e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.78695198278001e-06, 'clip_ratio/high_max': 2.7282983865006827e-05, 'clip_ratio/region_mean': 4.0145126376955886e-05, 'epoch': 0.08}
+
+  8%|▊         | 85/1024 [3:38:03<40:12:00, 154.12s/it][AINFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 86/1024 [3:40:50<41:13:05, 158.19s/it][A
+                                                       [A{'loss': 0.0389, 'grad_norm': 0.0038264680188149214, 'learning_rate': 1e-05, 'num_tokens': 66984285.0, 'completions/mean_length': 7072.53125, 'completions/min_length': 48.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6924.73046875, 'completions/min_terminated_length': 48.0, 'completions/max_terminated_length': 15594.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021116644144058228, 'sampling/sampling_logp_difference/max': 6.17248010635376, 'sampling/importance_sampling_ratio/min': 0.0020860559307038784, 'sampling/importance_sampling_ratio/mean': 0.9999492764472961, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0157204791903496, 'clip_ratio/low_mean': 3.9277208315979806e-05, 'clip_ratio/low_min': 4.51475443696836e-06, 'clip_ratio/high_mean': 7.449344252563606e-07, 'clip_ratio/high_max': 2.9797377010254422e-06, 'clip_ratio/region_mean': 4.002214268439275e-05, 'epoch': 0.08}
+
+  8%|▊         | 86/1024 [3:40:50<41:13:05, 158.19s/it][AINFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 87/1024 [3:43:28<41:06:57, 157.97s/it][A
+                                                       [A{'loss': 0.0583, 'grad_norm': 0.0044838739559054375, 'learning_rate': 1e-05, 'num_tokens': 67840310.0, 'completions/mean_length': 6539.8203125, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6303.56005859375, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15923.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2722293734550476, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020990263670682907, 'sampling/sampling_logp_difference/max': 6.374995231628418, 'sampling/importance_sampling_ratio/min': 0.001703627873212099, 'sampling/importance_sampling_ratio/mean': 0.9999875426292419, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0071343630552292, 'clip_ratio/low_mean': 3.757404465432046e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5179480215010699e-06, 'clip_ratio/high_max': 6.0717920860042796e-06, 'clip_ratio/region_mean': 3.909199278950837e-05, 'epoch': 0.08}
+
+  8%|▊         | 87/1024 [3:43:28<41:06:57, 157.97s/it][AINFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▊         | 88/1024 [3:46:23<42:24:42, 163.12s/it][A
+                                                       [A{'loss': -0.0057, 'grad_norm': 0.0034659637603908777, 'learning_rate': 1e-05, 'num_tokens': 68782042.0, 'completions/mean_length': 7204.09375, 'completions/min_length': 42.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6907.9677734375, 'completions/min_terminated_length': 42.0, 'completions/max_terminated_length': 16224.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.27958327531814575, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02053149789571762, 'sampling/sampling_logp_difference/max': 8.002180099487305, 'sampling/importance_sampling_ratio/min': 0.0003347320598550141, 'sampling/importance_sampling_ratio/mean': 0.9999324083328247, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9221752807497978, 'clip_ratio/low_mean': 3.50394579982094e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.14752542307906e-06, 'clip_ratio/high_max': 2.859010169231624e-05, 'clip_ratio/region_mean': 4.218698381919239e-05, 'epoch': 0.08}
+
+  9%|▊         | 88/1024 [3:46:23<42:24:42, 163.12s/it][AINFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▊         | 89/1024 [3:49:00<41:53:39, 161.30s/it][A
+                                                       [A{'loss': 0.0584, 'grad_norm': 0.0024458845146000385, 'learning_rate': 1e-05, 'num_tokens': 69526295.0, 'completions/mean_length': 5662.1640625, 'completions/min_length': 391.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5577.740234375, 'completions/min_terminated_length': 391.0, 'completions/max_terminated_length': 14764.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.18543373048305511, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018346723169088364, 'sampling/sampling_logp_difference/max': 5.6851115226745605, 'sampling/importance_sampling_ratio/min': 0.0033961546141654253, 'sampling/importance_sampling_ratio/mean': 0.9999278783798218, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9678512960672379, 'clip_ratio/low_mean': 2.086669928758056e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.355054784355161e-06, 'clip_ratio/high_max': 1.7420219137420645e-05, 'clip_ratio/region_mean': 2.522175350350153e-05, 'epoch': 0.08}
+
+  9%|▊         | 89/1024 [3:49:00<41:53:39, 161.30s/it][AINFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 90/1024 [3:51:27<40:46:23, 157.16s/it][A
+                                                       [A{'loss': 0.0719, 'grad_norm': 0.004733253736048937, 'learning_rate': 1e-05, 'num_tokens': 70262771.0, 'completions/mean_length': 5590.71875, 'completions/min_length': 382.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5505.732421875, 'completions/min_terminated_length': 382.0, 'completions/max_terminated_length': 16219.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.26933354139328003, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019460031762719154, 'sampling/sampling_logp_difference/max': 11.303396224975586, 'sampling/importance_sampling_ratio/min': 1.233097464137245e-05, 'sampling/importance_sampling_ratio/mean': 0.9999312162399292, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9286820441484451, 'clip_ratio/low_mean': 1.8629728629093734e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0234394924045773e-06, 'clip_ratio/high_max': 8.09375796961831e-06, 'clip_ratio/region_mean': 2.0653167894124635e-05, 'epoch': 0.08}
+
+  9%|▉         | 90/1024 [3:51:27<40:46:23, 157.16s/it][AINFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 91/1024 [3:54:03<40:36:27, 156.69s/it][A
+                                                       [A{'loss': 0.0223, 'grad_norm': 0.00468763243407011, 'learning_rate': 1e-05, 'num_tokens': 71079953.0, 'completions/mean_length': 6182.484375, 'completions/min_length': 319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6102.1572265625, 'completions/min_terminated_length': 319.0, 'completions/max_terminated_length': 15879.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.26933354139328003, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02069907821714878, 'sampling/sampling_logp_difference/max': 9.24995231628418, 'sampling/importance_sampling_ratio/min': 9.611623681848869e-05, 'sampling/importance_sampling_ratio/mean': 1.0000090599060059, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0872880518436432, 'clip_ratio/low_mean': 2.489819087259093e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.592780669554486e-06, 'clip_ratio/high_max': 1.8371122678217944e-05, 'clip_ratio/region_mean': 2.949097142845858e-05, 'epoch': 0.08}
+
+  9%|▉         | 91/1024 [3:54:03<40:36:27, 156.69s/it][AINFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 92/1024 [3:56:35<40:09:41, 155.13s/it][A
+                                                       [A{'loss': 0.0642, 'grad_norm': 0.0034273737110197544, 'learning_rate': 1e-05, 'num_tokens': 71856574.0, 'completions/mean_length': 5909.2265625, 'completions/min_length': 433.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5826.748046875, 'completions/min_terminated_length': 433.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019539739936590195, 'sampling/sampling_logp_difference/max': 8.687297821044922, 'sampling/importance_sampling_ratio/min': 0.00016871529805939645, 'sampling/importance_sampling_ratio/mean': 0.9998411536216736, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9488153457641602, 'clip_ratio/low_mean': 2.6412633246764017e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.633066396309005e-06, 'clip_ratio/high_max': 1.579416039021453e-05, 'clip_ratio/region_mean': 3.1045699415699346e-05, 'epoch': 0.08}
+
+  9%|▉         | 92/1024 [3:56:35<40:09:41, 155.13s/it][AINFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 93/1024 [3:59:10<40:07:39, 155.17s/it][A
+                                                       [A{'loss': 0.0314, 'grad_norm': 0.003149663796648383, 'learning_rate': 1e-05, 'num_tokens': 72696806.0, 'completions/mean_length': 6381.3125, 'completions/min_length': 58.0, 'completions/max_length': 15933.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6381.3125, 'completions/min_terminated_length': 58.0, 'completions/max_terminated_length': 15933.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021038895472884178, 'sampling/sampling_logp_difference/max': 7.997447967529297, 'sampling/importance_sampling_ratio/min': 0.00033631984842941165, 'sampling/importance_sampling_ratio/mean': 0.999916136264801, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9708949401974678, 'clip_ratio/low_mean': 4.2946558664880286e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.362454420624999e-07, 'clip_ratio/high_max': 3.7449817682499997e-06, 'clip_ratio/region_mean': 4.388280387956911e-05, 'epoch': 0.09}
+
+  9%|▉         | 93/1024 [3:59:10<40:07:39, 155.17s/it][AINFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 94/1024 [4:01:44<40:00:37, 154.88s/it][A
+                                                       [A{'loss': 0.0355, 'grad_norm': 0.0029015145264565945, 'learning_rate': 1e-05, 'num_tokens': 73449210.0, 'completions/mean_length': 5726.03125, 'completions/min_length': 831.0, 'completions/max_length': 16180.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5726.03125, 'completions/min_terminated_length': 831.0, 'completions/max_terminated_length': 16180.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020026210695505142, 'sampling/sampling_logp_difference/max': 8.68747615814209, 'sampling/importance_sampling_ratio/min': 0.0001686852192506194, 'sampling/importance_sampling_ratio/mean': 0.9999687671661377, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9100239053368568, 'clip_ratio/low_mean': 4.956343445883249e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6230393384830677e-06, 'clip_ratio/high_max': 6.492157353932271e-06, 'clip_ratio/region_mean': 5.118647413837607e-05, 'epoch': 0.09}
+
+  9%|▉         | 94/1024 [4:01:44<40:00:37, 154.88s/it][AINFO 12-01 17:26:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:26:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:26:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:26:44 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 95/1024 [4:04:03<38:42:35, 150.01s/it][A
+                                                       [A{'loss': 0.0387, 'grad_norm': 0.0045582144521176815, 'learning_rate': 1e-05, 'num_tokens': 74212662.0, 'completions/mean_length': 5824.90625, 'completions/min_length': 364.0, 'completions/max_length': 15624.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5824.90625, 'completions/min_terminated_length': 364.0, 'completions/max_terminated_length': 15624.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.24777324497699738, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019039880484342575, 'sampling/sampling_logp_difference/max': 6.146263599395752, 'sampling/importance_sampling_ratio/min': 0.0021414682269096375, 'sampling/importance_sampling_ratio/mean': 1.0000125169754028, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9244210943579674, 'clip_ratio/low_mean': 1.4287397789303213e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.99904036182852e-06, 'clip_ratio/high_max': 1.199616144731408e-05, 'clip_ratio/region_mean': 1.7286438151131733e-05, 'epoch': 0.09}
+
+  9%|▉         | 95/1024 [4:04:03<38:42:35, 150.01s/it][AINFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 96/1024 [4:06:34<38:48:11, 150.53s/it][A
+                                                       [A{'loss': 0.0776, 'grad_norm': 0.0040692174807190895, 'learning_rate': 1e-05, 'num_tokens': 75054003.0, 'completions/mean_length': 6432.7265625, 'completions/min_length': 199.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6274.77001953125, 'completions/min_terminated_length': 199.0, 'completions/max_terminated_length': 15600.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.35506343841552734, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019711513072252274, 'sampling/sampling_logp_difference/max': 5.194499492645264, 'sampling/importance_sampling_ratio/min': 0.005546991713345051, 'sampling/importance_sampling_ratio/mean': 0.9998587369918823, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8756264597177505, 'clip_ratio/low_mean': 4.0637585470904014e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.527106175875815e-06, 'clip_ratio/high_max': 1.010842470350326e-05, 'clip_ratio/region_mean': 4.316469153309299e-05, 'epoch': 0.09}
+
+  9%|▉         | 96/1024 [4:06:34<38:48:11, 150.53s/it][AINFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 97/1024 [4:08:58<38:15:55, 148.60s/it][A
+                                                       [A{'loss': 0.1137, 'grad_norm': 0.0035478502977639437, 'learning_rate': 1e-05, 'num_tokens': 75773194.0, 'completions/mean_length': 5474.6796875, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5388.779296875, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 14589.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.26037710905075073, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018789665773510933, 'sampling/sampling_logp_difference/max': 5.454678535461426, 'sampling/importance_sampling_ratio/min': 0.004276251420378685, 'sampling/importance_sampling_ratio/mean': 1.0000132322311401, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9279408678412437, 'clip_ratio/low_mean': 3.6582903135240485e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.6582903135240485e-05, 'epoch': 0.09}
+
+  9%|▉         | 97/1024 [4:08:58<38:15:55, 148.60s/it][AINFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 98/1024 [4:11:30<38:29:01, 149.61s/it][A
+                                                       [A{'loss': 0.0681, 'grad_norm': 0.004816337022930384, 'learning_rate': 1e-05, 'num_tokens': 76654837.0, 'completions/mean_length': 6730.2734375, 'completions/min_length': 235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6577.0400390625, 'completions/min_terminated_length': 235.0, 'completions/max_terminated_length': 15653.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.35325103998184204, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021000642329454422, 'sampling/sampling_logp_difference/max': 13.464577674865723, 'sampling/importance_sampling_ratio/min': 1.4203919818100985e-06, 'sampling/importance_sampling_ratio/mean': 1.0000003576278687, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0115349367260933, 'clip_ratio/low_mean': 4.1461861655989196e-05, 'clip_ratio/low_min': 3.5008122267754516e-06, 'clip_ratio/high_mean': 2.0568871832438163e-06, 'clip_ratio/high_max': 8.227548732975265e-06, 'clip_ratio/region_mean': 4.351874804342515e-05, 'epoch': 0.09}
+
+ 10%|▉         | 98/1024 [4:11:30<38:29:01, 149.61s/it][AINFO 12-01 17:36:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:36:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:36:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:36:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 99/1024 [4:13:47<37:27:34, 145.79s/it][A
+                                                       [A{'loss': -0.0188, 'grad_norm': 0.00695947976782918, 'learning_rate': 1e-05, 'num_tokens': 77287704.0, 'completions/mean_length': 4804.5859375, 'completions/min_length': 54.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4620.7861328125, 'completions/min_terminated_length': 54.0, 'completions/max_terminated_length': 14350.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.2688046097755432, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019261913374066353, 'sampling/sampling_logp_difference/max': 2.9661245346069336, 'sampling/importance_sampling_ratio/min': 0.051502522081136703, 'sampling/importance_sampling_ratio/mean': 1.000001072883606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8622925356030464, 'clip_ratio/low_mean': 2.399133984454238e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.208268930800841e-06, 'clip_ratio/high_max': 2.0833075723203365e-05, 'clip_ratio/region_mean': 2.919960945746425e-05, 'epoch': 0.09}
+
+ 10%|▉         | 99/1024 [4:13:47<37:27:34, 145.79s/it][AINFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 100/1024 [4:16:25<38:20:47, 149.40s/it][A
+                                                        [A{'loss': 0.1412, 'grad_norm': 0.0034830078948289156, 'learning_rate': 1e-05, 'num_tokens': 78054048.0, 'completions/mean_length': 5836.25, 'completions/min_length': 310.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5753.19677734375, 'completions/min_terminated_length': 310.0, 'completions/max_terminated_length': 15997.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29036492109298706, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01845550537109375, 'sampling/sampling_logp_difference/max': 12.792928695678711, 'sampling/importance_sampling_ratio/min': 2.7803641842183424e-06, 'sampling/importance_sampling_ratio/mean': 0.9999365210533142, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8808795213699341, 'clip_ratio/low_mean': 3.53349669239833e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.221566203137627e-06, 'clip_ratio/high_max': 1.2886264812550507e-05, 'clip_ratio/region_mean': 3.8556532899747253e-05, 'epoch': 0.09}
+
+ 10%|▉         | 100/1024 [4:16:25<38:20:47, 149.40s/it][AINFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 101/1024 [4:18:52<38:07:51, 148.72s/it][A
+                                                        [A{'loss': -0.0024, 'grad_norm': 0.0028610217850655317, 'learning_rate': 1e-05, 'num_tokens': 78765225.0, 'completions/mean_length': 5407.5703125, 'completions/min_length': 374.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5233.341796875, 'completions/min_terminated_length': 374.0, 'completions/max_terminated_length': 13964.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.26037710905075073, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018839653581380844, 'sampling/sampling_logp_difference/max': 9.742315292358398, 'sampling/importance_sampling_ratio/min': 5.874436828889884e-05, 'sampling/importance_sampling_ratio/mean': 0.9999171495437622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9438152015209198, 'clip_ratio/low_mean': 3.4728200375866436e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.57742361909186e-06, 'clip_ratio/high_max': 2.630969447636744e-05, 'clip_ratio/region_mean': 4.1305623994958296e-05, 'epoch': 0.09}
+
+ 10%|▉         | 101/1024 [4:18:52<38:07:51, 148.72s/it][AINFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 102/1024 [4:21:39<39:29:10, 154.18s/it][A
+                                                        [A{'loss': 0.0626, 'grad_norm': 0.004098972305655479, 'learning_rate': 1e-05, 'num_tokens': 79628691.0, 'completions/mean_length': 6591.765625, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6436.33349609375, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15780.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.26932865381240845, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02011241763830185, 'sampling/sampling_logp_difference/max': 6.386111259460449, 'sampling/importance_sampling_ratio/min': 0.001684795250184834, 'sampling/importance_sampling_ratio/mean': 0.9999697208404541, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9185260459780693, 'clip_ratio/low_mean': 3.569766681721376e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.917444360013178e-06, 'clip_ratio/high_max': 1.2485550996643724e-05, 'clip_ratio/region_mean': 3.961511060879275e-05, 'epoch': 0.09}
+
+ 10%|▉         | 102/1024 [4:21:39<39:29:10, 154.18s/it][AINFO 12-01 17:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:46:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 103/1024 [4:24:25<40:18:08, 157.53s/it][A
+                                                        [A{'loss': 0.0695, 'grad_norm': 0.003109709592536092, 'learning_rate': 1e-05, 'num_tokens': 80513135.0, 'completions/mean_length': 6762.40625, 'completions/min_length': 181.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6371.2841796875, 'completions/min_terminated_length': 181.0, 'completions/max_terminated_length': 16014.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.27274850010871887, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021886618807911873, 'sampling/sampling_logp_difference/max': 5.6049675941467285, 'sampling/importance_sampling_ratio/min': 0.0036795397754758596, 'sampling/importance_sampling_ratio/mean': 0.999967098236084, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0496173724532127, 'clip_ratio/low_mean': 2.3897301389297354e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.543192294979235e-06, 'clip_ratio/high_max': 1.017276917991694e-05, 'clip_ratio/region_mean': 2.644049368427659e-05, 'epoch': 0.09}
+
+ 10%|█         | 103/1024 [4:24:25<40:18:08, 157.53s/it][AINFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 104/1024 [4:27:18<41:27:11, 162.21s/it][A
+                                                        [A{'loss': 0.1028, 'grad_norm': 0.0050065224058926105, 'learning_rate': 1e-05, 'num_tokens': 81579941.0, 'completions/mean_length': 8151.421875, 'completions/min_length': 1052.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7528.79052734375, 'completions/min_terminated_length': 1052.0, 'completions/max_terminated_length': 15653.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.36691081523895264, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02017449401319027, 'sampling/sampling_logp_difference/max': 7.187410831451416, 'sampling/importance_sampling_ratio/min': 0.0007560441154055297, 'sampling/importance_sampling_ratio/mean': 0.9999760389328003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8989155367016792, 'clip_ratio/low_mean': 5.0279177912671e-05, 'clip_ratio/low_min': 6.849113788121031e-06, 'clip_ratio/high_mean': 2.6558238346297003e-06, 'clip_ratio/high_max': 1.0623295338518801e-05, 'clip_ratio/region_mean': 5.29350020315178e-05, 'epoch': 0.1}
+
+ 10%|█         | 104/1024 [4:27:18<41:27:11, 162.21s/it][AINFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 105/1024 [4:29:53<40:52:02, 160.09s/it][A
+                                                        [A{'loss': 0.0927, 'grad_norm': 0.00352756236679852, 'learning_rate': 1e-05, 'num_tokens': 82479474.0, 'completions/mean_length': 6871.7265625, 'completions/min_length': 1044.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6643.43212890625, 'completions/min_terminated_length': 1044.0, 'completions/max_terminated_length': 16094.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.33296146988868713, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021244853734970093, 'sampling/sampling_logp_difference/max': 3.749523162841797, 'sampling/importance_sampling_ratio/min': 0.023528963327407837, 'sampling/importance_sampling_ratio/mean': 1.000028133392334, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.006680078804493, 'clip_ratio/low_mean': 4.2927287609018094e-05, 'clip_ratio/low_min': 4.201963292871369e-06, 'clip_ratio/high_mean': 1.9156864254910033e-06, 'clip_ratio/high_max': 7.662745701964013e-06, 'clip_ratio/region_mean': 4.484297357976175e-05, 'epoch': 0.1}
+
+ 10%|█         | 105/1024 [4:29:53<40:52:02, 160.09s/it][AINFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 106/1024 [4:32:08<38:56:25, 152.71s/it][A
+                                                        [A{'loss': -0.0169, 'grad_norm': 0.002348776441067457, 'learning_rate': 1e-05, 'num_tokens': 83229071.0, 'completions/mean_length': 5705.6015625, 'completions/min_length': 802.0, 'completions/max_length': 14462.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5705.6015625, 'completions/min_terminated_length': 802.0, 'completions/max_terminated_length': 14462.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29249149560928345, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01885361783206463, 'sampling/sampling_logp_difference/max': 11.35004997253418, 'sampling/importance_sampling_ratio/min': 1.176890145870857e-05, 'sampling/importance_sampling_ratio/mean': 0.9999898672103882, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9162084609270096, 'clip_ratio/low_mean': 2.3860119426899473e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.544197733797773e-06, 'clip_ratio/high_max': 1.6621729173493804e-05, 'clip_ratio/region_mean': 2.9404316592263058e-05, 'epoch': 0.1}
+
+ 10%|█         | 106/1024 [4:32:08<38:56:25, 152.71s/it][AINFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 107/1024 [4:34:46<39:16:24, 154.18s/it][A
+                                                        [A{'loss': 0.038, 'grad_norm': 0.005057404283434153, 'learning_rate': 1e-05, 'num_tokens': 84119947.0, 'completions/mean_length': 6823.90625, 'completions/min_length': 129.0, 'completions/max_length': 16110.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6823.90625, 'completions/min_terminated_length': 129.0, 'completions/max_terminated_length': 16110.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.31246691942214966, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021600374951958656, 'sampling/sampling_logp_difference/max': 4.219791412353516, 'sampling/importance_sampling_ratio/min': 0.014701711013913155, 'sampling/importance_sampling_ratio/mean': 0.9999507665634155, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0139815732836723, 'clip_ratio/low_mean': 5.359476631383586e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.359476631383586e-05, 'epoch': 0.1}
+
+ 10%|█         | 107/1024 [4:34:46<39:16:24, 154.18s/it][AINFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 108/1024 [4:37:04<38:01:32, 149.45s/it][A
+                                                        [A{'loss': 0.0506, 'grad_norm': 0.008517255075275898, 'learning_rate': 1e-05, 'num_tokens': 84879833.0, 'completions/mean_length': 5786.859375, 'completions/min_length': 643.0, 'completions/max_length': 15516.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5786.859375, 'completions/min_terminated_length': 643.0, 'completions/max_terminated_length': 15516.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3311441242694855, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01993538998067379, 'sampling/sampling_logp_difference/max': 9.187470436096191, 'sampling/importance_sampling_ratio/min': 0.00010231334454147145, 'sampling/importance_sampling_ratio/mean': 0.9999799728393555, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0515320897102356, 'clip_ratio/low_mean': 3.813199691649061e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.10628331337648e-06, 'clip_ratio/high_max': 1.642513325350592e-05, 'clip_ratio/region_mean': 4.2238279775119736e-05, 'epoch': 0.1}
+
+ 11%|█         | 108/1024 [4:37:04<38:01:32, 149.45s/it][AINFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 109/1024 [4:39:34<37:59:46, 149.49s/it][A
+                                                        [A{'loss': 0.0214, 'grad_norm': 0.0034334585070610046, 'learning_rate': 1e-05, 'num_tokens': 85503162.0, 'completions/mean_length': 4726.2578125, 'completions/min_length': 406.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4634.46435546875, 'completions/min_terminated_length': 406.0, 'completions/max_terminated_length': 15836.0, 'rewards/accuracy_reward/mean': 0.6015625, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.6015625, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018191032111644745, 'sampling/sampling_logp_difference/max': 5.9298248291015625, 'sampling/importance_sampling_ratio/min': 0.0026589478366076946, 'sampling/importance_sampling_ratio/mean': 1.0000437498092651, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.795353539288044, 'clip_ratio/low_mean': 1.4313530300569255e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7510926682007266e-06, 'clip_ratio/high_max': 7.0043706728029065e-06, 'clip_ratio/region_mean': 1.606462308245682e-05, 'epoch': 0.1}
+
+ 11%|█         | 109/1024 [4:39:34<37:59:46, 149.49s/it][AINFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 110/1024 [4:42:27<39:46:57, 156.69s/it][A
+                                                        [A{'loss': 0.0811, 'grad_norm': 0.006242698058485985, 'learning_rate': 1e-05, 'num_tokens': 86350364.0, 'completions/mean_length': 6450.140625, 'completions/min_length': 401.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5787.8837890625, 'completions/min_terminated_length': 401.0, 'completions/max_terminated_length': 14514.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.27540695667266846, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01948007568717003, 'sampling/sampling_logp_difference/max': 8.794099807739258, 'sampling/importance_sampling_ratio/min': 0.00015162504860199988, 'sampling/importance_sampling_ratio/mean': 0.9999819993972778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8920315206050873, 'clip_ratio/low_mean': 3.989860044839588e-05, 'clip_ratio/low_min': 4.927079316985328e-06, 'clip_ratio/high_mean': 1.037309971252398e-06, 'clip_ratio/high_max': 4.149239885009592e-06, 'clip_ratio/region_mean': 4.093591041964828e-05, 'epoch': 0.1}
+
+ 11%|█         | 110/1024 [4:42:27<39:46:57, 156.69s/it][AINFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 111/1024 [4:45:07<39:57:26, 157.55s/it][A
+                                                        [A{'loss': 0.018, 'grad_norm': 0.002594202058389783, 'learning_rate': 1e-05, 'num_tokens': 87213277.0, 'completions/mean_length': 6597.9453125, 'completions/min_length': 657.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6442.611328125, 'completions/min_terminated_length': 657.0, 'completions/max_terminated_length': 15253.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3061561584472656, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02082553133368492, 'sampling/sampling_logp_difference/max': 4.905908584594727, 'sampling/importance_sampling_ratio/min': 0.007402713876217604, 'sampling/importance_sampling_ratio/mean': 0.9998740553855896, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9351271465420723, 'clip_ratio/low_mean': 2.8560575628944207e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8426849237584975e-06, 'clip_ratio/high_max': 4.065173015987966e-06, 'clip_ratio/region_mean': 3.0403260552702704e-05, 'epoch': 0.1}
+
+ 11%|█         | 111/1024 [4:45:07<39:57:26, 157.55s/it][AINFO 12-01 18:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:10:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 112/1024 [4:47:46<40:03:31, 158.13s/it][A
+                                                        [A{'loss': 0.0757, 'grad_norm': 0.002718541072681546, 'learning_rate': 1e-05, 'num_tokens': 88144530.0, 'completions/mean_length': 7109.9140625, 'completions/min_length': 881.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7036.8896484375, 'completions/min_terminated_length': 881.0, 'completions/max_terminated_length': 15955.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.26485776901245117, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01960277371108532, 'sampling/sampling_logp_difference/max': 8.36449146270752, 'sampling/importance_sampling_ratio/min': 0.0002329955023014918, 'sampling/importance_sampling_ratio/mean': 0.999973714351654, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8797949478030205, 'clip_ratio/low_mean': 4.297400278119312e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.382130201629479e-07, 'clip_ratio/high_max': 3.7528520806517918e-06, 'clip_ratio/region_mean': 4.391221568766923e-05, 'epoch': 0.1}
+
+ 11%|█         | 112/1024 [4:47:46<40:03:31, 158.13s/it][AINFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 113/1024 [4:50:43<41:23:36, 163.57s/it][A
+                                                        [A{'loss': 0.0854, 'grad_norm': 0.003097688313573599, 'learning_rate': 1e-05, 'num_tokens': 89109897.0, 'completions/mean_length': 7361.6796875, 'completions/min_length': 624.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6513.427734375, 'completions/min_terminated_length': 624.0, 'completions/max_terminated_length': 15834.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3148210048675537, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01997425965964794, 'sampling/sampling_logp_difference/max': 6.834630012512207, 'sampling/importance_sampling_ratio/min': 0.0010758653515949845, 'sampling/importance_sampling_ratio/mean': 0.9998917579650879, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9020541086792946, 'clip_ratio/low_mean': 4.423825043886609e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.813705350490636e-06, 'clip_ratio/high_max': 1.1254821401962545e-05, 'clip_ratio/region_mean': 4.7051955789356725e-05, 'epoch': 0.1}
+
+ 11%|█         | 113/1024 [4:50:43<41:23:36, 163.57s/it][AINFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 114/1024 [4:53:25<41:16:13, 163.27s/it][A
+                                                        [A{'loss': 0.0869, 'grad_norm': 0.0023438548669219017, 'learning_rate': 1e-05, 'num_tokens': 89891429.0, 'completions/mean_length': 5957.28125, 'completions/min_length': 749.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5620.935546875, 'completions/min_terminated_length': 749.0, 'completions/max_terminated_length': 15608.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3713865876197815, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018976174294948578, 'sampling/sampling_logp_difference/max': 11.706428527832031, 'sampling/importance_sampling_ratio/min': 8.2406731962692e-06, 'sampling/importance_sampling_ratio/mean': 0.9998185634613037, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8262394368648529, 'clip_ratio/low_mean': 7.228819413285237e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.248351158115838e-06, 'clip_ratio/high_max': 1.8235970401292434e-05, 'clip_ratio/region_mean': 7.753654563202872e-05, 'epoch': 0.1}
+
+ 11%|█         | 114/1024 [4:53:25<41:16:13, 163.27s/it][AINFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 115/1024 [4:55:59<40:29:03, 160.33s/it][A
+                                                        [A{'loss': 0.0411, 'grad_norm': 0.005619170609861612, 'learning_rate': 1e-05, 'num_tokens': 90600721.0, 'completions/mean_length': 5405.53125, 'completions/min_length': 230.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5142.04833984375, 'completions/min_terminated_length': 230.0, 'completions/max_terminated_length': 15509.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.40821409225463867, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01931554079055786, 'sampling/sampling_logp_difference/max': 16.351388931274414, 'sampling/importance_sampling_ratio/min': 7.91921266340978e-08, 'sampling/importance_sampling_ratio/mean': 0.9999438524246216, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9246686547994614, 'clip_ratio/low_mean': 5.1420432782833814e-05, 'clip_ratio/low_min': 6.1973228184797335e-06, 'clip_ratio/high_mean': 5.4644419833493885e-06, 'clip_ratio/high_max': 1.6280149793601595e-05, 'clip_ratio/region_mean': 5.688487522093055e-05, 'epoch': 0.11}
+
+ 11%|█         | 115/1024 [4:55:59<40:29:03, 160.33s/it][AINFO 12-01 18:20:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:20:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:20:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:20:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█▏        | 116/1024 [4:58:43<40:43:24, 161.46s/it][A
+                                                        [A{'loss': 0.0379, 'grad_norm': 0.006043895613402128, 'learning_rate': 1e-05, 'num_tokens': 91486063.0, 'completions/mean_length': 6754.859375, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6363.4306640625, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 16106.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2527858018875122, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02107170596718788, 'sampling/sampling_logp_difference/max': 12.875, 'sampling/importance_sampling_ratio/min': 2.5612887384340866e-06, 'sampling/importance_sampling_ratio/mean': 0.9999067783355713, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.952000230550766, 'clip_ratio/low_mean': 3.463903834699522e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.307115153143968e-06, 'clip_ratio/high_max': 9.228460612575873e-06, 'clip_ratio/region_mean': 3.694615350013919e-05, 'epoch': 0.11}
+
+ 11%|█▏        | 116/1024 [4:58:43<40:43:24, 161.46s/it][AINFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█▏        | 117/1024 [5:01:50<42:38:56, 169.28s/it][A
+                                                        [A{'loss': 0.0666, 'grad_norm': 0.00392121123149991, 'learning_rate': 1e-05, 'num_tokens': 92546920.0, 'completions/mean_length': 8135.8203125, 'completions/min_length': 649.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7869.75, 'completions/min_terminated_length': 649.0, 'completions/max_terminated_length': 16377.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2977413833141327, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02211480587720871, 'sampling/sampling_logp_difference/max': 10.189286231994629, 'sampling/importance_sampling_ratio/min': 3.757069134735502e-05, 'sampling/importance_sampling_ratio/mean': 0.9999874830245972, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0832853615283966, 'clip_ratio/low_mean': 3.14642731495951e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.65198184226756e-06, 'clip_ratio/high_max': 1.460792736907024e-05, 'clip_ratio/region_mean': 3.511625499186266e-05, 'epoch': 0.11}
+
+ 11%|█▏        | 117/1024 [5:01:50<42:38:56, 169.28s/it][AINFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 118/1024 [5:04:15<40:43:37, 161.83s/it][A
+                                                        [A{'loss': 0.0378, 'grad_norm': 0.00480870483443141, 'learning_rate': 1e-05, 'num_tokens': 93270524.0, 'completions/mean_length': 5476.53125, 'completions/min_length': 666.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5214.75244140625, 'completions/min_terminated_length': 666.0, 'completions/max_terminated_length': 15497.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3243093490600586, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01990744285285473, 'sampling/sampling_logp_difference/max': 3.5937137603759766, 'sampling/importance_sampling_ratio/min': 0.02749602682888508, 'sampling/importance_sampling_ratio/mean': 1.000068187713623, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0261689275503159, 'clip_ratio/low_mean': 3.652223790595599e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.964218977780547e-06, 'clip_ratio/high_max': 3.585687591112219e-05, 'clip_ratio/region_mean': 4.548645733848389e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 118/1024 [5:04:15<40:43:37, 161.83s/it][AINFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 119/1024 [5:07:14<41:58:40, 166.98s/it][A
+                                                        [A{'loss': 0.0792, 'grad_norm': 0.003411791054531932, 'learning_rate': 1e-05, 'num_tokens': 94271404.0, 'completions/mean_length': 7670.0625, 'completions/min_length': 964.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7165.9501953125, 'completions/min_terminated_length': 964.0, 'completions/max_terminated_length': 16209.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.28117600083351135, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01960139349102974, 'sampling/sampling_logp_difference/max': 13.061310768127441, 'sampling/importance_sampling_ratio/min': 2.125909531969228e-06, 'sampling/importance_sampling_ratio/mean': 0.999955415725708, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8719229996204376, 'clip_ratio/low_mean': 3.6732255466631614e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2816832395401434e-06, 'clip_ratio/high_max': 5.126732958160574e-06, 'clip_ratio/region_mean': 3.8013938819858595e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 119/1024 [5:07:14<41:58:40, 166.98s/it][AINFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 120/1024 [5:09:43<40:34:10, 161.56s/it][A
+                                                        [A{'loss': 0.0852, 'grad_norm': 0.0036615384742617607, 'learning_rate': 1e-05, 'num_tokens': 94998263.0, 'completions/mean_length': 5499.0859375, 'completions/min_length': 867.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5413.3779296875, 'completions/min_terminated_length': 867.0, 'completions/max_terminated_length': 15284.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.27776598930358887, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01831059902906418, 'sampling/sampling_logp_difference/max': 8.126622200012207, 'sampling/importance_sampling_ratio/min': 0.00029556488152593374, 'sampling/importance_sampling_ratio/mean': 0.9999586939811707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8891193494200706, 'clip_ratio/low_mean': 3.3884271260831156e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0189622685174982e-05, 'clip_ratio/high_max': 3.2011115308705484e-05, 'clip_ratio/region_mean': 4.4073893604945624e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 120/1024 [5:09:43<40:34:10, 161.56s/it][AINFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 121/1024 [5:12:30<40:58:04, 163.33s/it][A
+                                                        [A{'loss': 0.0704, 'grad_norm': 0.003688640194013715, 'learning_rate': 1e-05, 'num_tokens': 96020572.0, 'completions/mean_length': 7831.1015625, 'completions/min_length': 855.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7410.466796875, 'completions/min_terminated_length': 855.0, 'completions/max_terminated_length': 15605.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.266974538564682, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020766064524650574, 'sampling/sampling_logp_difference/max': 7.095963478088379, 'sampling/importance_sampling_ratio/min': 0.0008284422219730914, 'sampling/importance_sampling_ratio/mean': 1.0000081062316895, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9511109218001366, 'clip_ratio/low_mean': 3.4662164466681133e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.505237830519036e-06, 'clip_ratio/high_max': 1.0020951322076144e-05, 'clip_ratio/region_mean': 3.716740218351333e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 121/1024 [5:12:30<40:58:04, 163.33s/it][AINFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 122/1024 [5:15:40<42:56:43, 171.40s/it][A
+                                                        [A{'loss': 0.0796, 'grad_norm': 0.002527788048610091, 'learning_rate': 1e-05, 'num_tokens': 97055892.0, 'completions/mean_length': 7928.5, 'completions/min_length': 289.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7584.7802734375, 'completions/min_terminated_length': 289.0, 'completions/max_terminated_length': 16267.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.22567617893218994, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02571871504187584, 'sampling/sampling_logp_difference/max': 11.72396469116211, 'sampling/importance_sampling_ratio/min': 8.097423233266454e-06, 'sampling/importance_sampling_ratio/mean': 0.999517560005188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.053833745419979, 'clip_ratio/low_mean': 4.2512260733929e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0797083405122976e-06, 'clip_ratio/high_max': 4.31883336204919e-06, 'clip_ratio/region_mean': 4.359196918812813e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 122/1024 [5:15:40<42:56:43, 171.40s/it][AINFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 123/1024 [5:18:20<42:01:35, 167.92s/it][A
+                                                        [A{'loss': 0.0524, 'grad_norm': 0.004057250916957855, 'learning_rate': 1e-05, 'num_tokens': 98026604.0, 'completions/mean_length': 7433.0, 'completions/min_length': 1112.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7218.17626953125, 'completions/min_terminated_length': 1112.0, 'completions/max_terminated_length': 15282.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.30274903774261475, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020892417058348656, 'sampling/sampling_logp_difference/max': 5.936958312988281, 'sampling/importance_sampling_ratio/min': 0.0026400478091090918, 'sampling/importance_sampling_ratio/mean': 0.9999719858169556, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0001763850450516, 'clip_ratio/low_mean': 5.3688914704252966e-05, 'clip_ratio/low_min': 1.0726187383625074e-05, 'clip_ratio/high_mean': 5.360034492696286e-06, 'clip_ratio/high_max': 2.1440137970785145e-05, 'clip_ratio/region_mean': 5.904894931063609e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 123/1024 [5:18:20<42:01:35, 167.92s/it][AINFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 124/1024 [5:21:03<41:35:03, 166.34s/it][A
+                                                        [A{'loss': 0.0561, 'grad_norm': 0.004367270041257143, 'learning_rate': 1e-05, 'num_tokens': 98882667.0, 'completions/mean_length': 6529.8046875, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6211.92724609375, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15435.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020555414259433746, 'sampling/sampling_logp_difference/max': 8.874999046325684, 'sampling/importance_sampling_ratio/min': 0.00013984176621306688, 'sampling/importance_sampling_ratio/mean': 0.9999692440032959, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0204281583428383, 'clip_ratio/low_mean': 3.0267089357494115e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8002238562075945e-06, 'clip_ratio/high_max': 7.200895424830378e-06, 'clip_ratio/region_mean': 3.206731355476222e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 124/1024 [5:21:03<41:35:03, 166.34s/it][AINFO 12-01 18:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:46:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 125/1024 [5:24:00<42:22:12, 169.67s/it][A
+                                                        [A{'loss': 0.027, 'grad_norm': 0.0014496444491669536, 'learning_rate': 1e-05, 'num_tokens': 99847384.0, 'completions/mean_length': 7329.9140625, 'completions/min_length': 525.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6806.12353515625, 'completions/min_terminated_length': 525.0, 'completions/max_terminated_length': 15737.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019216356799006462, 'sampling/sampling_logp_difference/max': 10.749985694885254, 'sampling/importance_sampling_ratio/min': 2.1445715901791118e-05, 'sampling/importance_sampling_ratio/mean': 0.9999719262123108, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8461082950234413, 'clip_ratio/low_mean': 3.819216192368913e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.958261901170772e-07, 'clip_ratio/high_max': 3.583304760468309e-06, 'clip_ratio/region_mean': 3.908798782958911e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 125/1024 [5:24:00<42:22:12, 169.67s/it][AINFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 126/1024 [5:26:42<41:42:17, 167.19s/it][A
+                                                        [A{'loss': 0.0475, 'grad_norm': 0.006009541917592287, 'learning_rate': 1e-05, 'num_tokens': 100699437.0, 'completions/mean_length': 6518.4765625, 'completions/min_length': 969.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6200.23388671875, 'completions/min_terminated_length': 969.0, 'completions/max_terminated_length': 15200.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01985173299908638, 'sampling/sampling_logp_difference/max': 9.606365203857422, 'sampling/importance_sampling_ratio/min': 6.729899905622005e-05, 'sampling/importance_sampling_ratio/mean': 0.9999701976776123, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.880072832107544, 'clip_ratio/low_mean': 3.4717084645308205e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.679183808140806e-06, 'clip_ratio/high_max': 1.0716735232563224e-05, 'clip_ratio/region_mean': 3.7396268680822686e-05, 'epoch': 0.12}
+
+ 12%|█▏        | 126/1024 [5:26:42<41:42:17, 167.19s/it][AINFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 127/1024 [5:29:38<42:19:43, 169.88s/it][A
+                                                        [A{'loss': 0.0562, 'grad_norm': 0.00254544778726995, 'learning_rate': 1e-05, 'num_tokens': 101797124.0, 'completions/mean_length': 8421.9296875, 'completions/min_length': 1180.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8030.35205078125, 'completions/min_terminated_length': 1180.0, 'completions/max_terminated_length': 16379.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2603819966316223, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020804740488529205, 'sampling/sampling_logp_difference/max': 10.75251579284668, 'sampling/importance_sampling_ratio/min': 2.139152456948068e-05, 'sampling/importance_sampling_ratio/mean': 0.9999698400497437, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.929582305252552, 'clip_ratio/low_mean': 3.8401355027417594e-05, 'clip_ratio/low_min': 3.4494178180466406e-06, 'clip_ratio/high_mean': 1.8907661001321685e-06, 'clip_ratio/high_max': 7.563064400528674e-06, 'clip_ratio/region_mean': 4.029212129808002e-05, 'epoch': 0.12}
+
+ 12%|█▏        | 127/1024 [5:29:38<42:19:43, 169.88s/it][AINFO 12-01 18:54:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:54:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:54:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:54:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▎        | 128/1024 [5:32:08<40:48:49, 163.98s/it][A
+                                                        [A{'loss': -0.0048, 'grad_norm': 0.0030309113208204508, 'learning_rate': 1e-05, 'num_tokens': 102643751.0, 'completions/mean_length': 6452.5859375, 'completions/min_length': 233.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6214.232421875, 'completions/min_terminated_length': 233.0, 'completions/max_terminated_length': 14871.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3453505039215088, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02046305686235428, 'sampling/sampling_logp_difference/max': 10.81167221069336, 'sampling/importance_sampling_ratio/min': 2.0162780856480822e-05, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9258717745542526, 'clip_ratio/low_mean': 3.5734614471039094e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.125810965480923e-06, 'clip_ratio/high_max': 8.503243861923693e-06, 'clip_ratio/region_mean': 3.7860425095459505e-05, 'epoch': 0.12}
+
+ 12%|█▎        | 128/1024 [5:32:08<40:48:49, 163.98s/it][AINFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 129/1024 [5:35:10<42:05:23, 169.30s/it][A
+                                                        [A{'loss': 0.0525, 'grad_norm': 0.0028038588352501392, 'learning_rate': 1e-05, 'num_tokens': 103645849.0, 'completions/mean_length': 7655.140625, 'completions/min_length': 1095.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7373.564453125, 'completions/min_terminated_length': 1095.0, 'completions/max_terminated_length': 16323.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.24435339868068695, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022147968411445618, 'sampling/sampling_logp_difference/max': 3.781249523162842, 'sampling/importance_sampling_ratio/min': 0.022794192656874657, 'sampling/importance_sampling_ratio/mean': 0.9999130964279175, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1112212240695953, 'clip_ratio/low_mean': 2.8848363626821083e-05, 'clip_ratio/low_min': 3.2798930078570265e-06, 'clip_ratio/high_mean': 4.865382209118252e-06, 'clip_ratio/high_max': 1.4670421251139487e-05, 'clip_ratio/region_mean': 3.371374566540908e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 129/1024 [5:35:10<42:05:23, 169.30s/it][AINFO 12-01 19:00:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:00:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:00:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:00:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 130/1024 [5:38:21<43:39:32, 175.81s/it][A
+                                                        [A{'loss': 0.0942, 'grad_norm': 0.003990175202488899, 'learning_rate': 1e-05, 'num_tokens': 104712987.0, 'completions/mean_length': 8166.765625, 'completions/min_length': 838.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7618.9501953125, 'completions/min_terminated_length': 838.0, 'completions/max_terminated_length': 15694.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2680353820323944, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019254228100180626, 'sampling/sampling_logp_difference/max': 10.624967575073242, 'sampling/importance_sampling_ratio/min': 2.430162021482829e-05, 'sampling/importance_sampling_ratio/mean': 0.9999572038650513, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8589507639408112, 'clip_ratio/low_mean': 2.8828401809732895e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.8828401809732895e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 130/1024 [5:38:21<43:39:32, 175.81s/it][AINFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 131/1024 [5:40:55<41:59:10, 169.26s/it][A
+                                                        [A{'loss': 0.0481, 'grad_norm': 0.0038855294696986675, 'learning_rate': 1e-05, 'num_tokens': 105481743.0, 'completions/mean_length': 5872.40625, 'completions/min_length': 352.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5789.6376953125, 'completions/min_terminated_length': 352.0, 'completions/max_terminated_length': 15444.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3527044355869293, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021132031455636024, 'sampling/sampling_logp_difference/max': 6.312424659729004, 'sampling/importance_sampling_ratio/min': 0.0018136304570361972, 'sampling/importance_sampling_ratio/mean': 0.9999517202377319, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0606305003166199, 'clip_ratio/low_mean': 3.547307028384239e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9298730080663518e-06, 'clip_ratio/high_max': 7.719492032265407e-06, 'clip_ratio/region_mean': 3.7402943462439e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 131/1024 [5:40:55<41:59:10, 169.26s/it][AINFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 132/1024 [5:43:33<41:07:51, 166.00s/it][A
+                                                        [A{'loss': 0.0487, 'grad_norm': 0.004712321795523167, 'learning_rate': 1e-05, 'num_tokens': 106333695.0, 'completions/mean_length': 6474.9375, 'completions/min_length': 194.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6237.1201171875, 'completions/min_terminated_length': 194.0, 'completions/max_terminated_length': 15742.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3874102830886841, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019161570817232132, 'sampling/sampling_logp_difference/max': 10.098255157470703, 'sampling/importance_sampling_ratio/min': 4.115129559068009e-05, 'sampling/importance_sampling_ratio/mean': 0.9999421834945679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8699874132871628, 'clip_ratio/low_mean': 4.114894863960217e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.614050223812228e-06, 'clip_ratio/high_max': 1.6221786609094124e-05, 'clip_ratio/region_mean': 4.6762998408667045e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 132/1024 [5:43:33<41:07:51, 166.00s/it][AINFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 133/1024 [5:46:14<40:43:52, 164.57s/it][A
+                                                        [A{'loss': 0.0574, 'grad_norm': 0.0031310587655752897, 'learning_rate': 1e-05, 'num_tokens': 107236363.0, 'completions/mean_length': 6910.03125, 'completions/min_length': 1212.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6604.4189453125, 'completions/min_terminated_length': 1212.0, 'completions/max_terminated_length': 15841.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019823957234621048, 'sampling/sampling_logp_difference/max': 6.661808490753174, 'sampling/importance_sampling_ratio/min': 0.0012788315070793033, 'sampling/importance_sampling_ratio/mean': 1.0000447034835815, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8597542196512222, 'clip_ratio/low_mean': 2.881602637216929e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.188186724401021e-06, 'clip_ratio/high_max': 1.2752746897604084e-05, 'clip_ratio/region_mean': 3.200421309657031e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 133/1024 [5:46:14<40:43:52, 164.57s/it][AINFO 12-01 19:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:11:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 134/1024 [5:48:58<40:38:06, 164.37s/it][A
+                                                        [A{'loss': -0.0101, 'grad_norm': 0.006233204621821642, 'learning_rate': 1e-05, 'num_tokens': 108044714.0, 'completions/mean_length': 6172.7421875, 'completions/min_length': 691.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5843.3466796875, 'completions/min_terminated_length': 691.0, 'completions/max_terminated_length': 15311.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020428352057933807, 'sampling/sampling_logp_difference/max': 6.656150817871094, 'sampling/importance_sampling_ratio/min': 0.0012860872084274888, 'sampling/importance_sampling_ratio/mean': 0.9999743700027466, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9560965895652771, 'clip_ratio/low_mean': 3.179941927555774e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.02184224665325e-06, 'clip_ratio/high_max': 1.2087368986613e-05, 'clip_ratio/region_mean': 3.482126135168073e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 134/1024 [5:48:58<40:38:06, 164.37s/it][AINFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 135/1024 [5:51:30<39:38:11, 160.51s/it][A
+                                                        [A{'loss': 0.1106, 'grad_norm': 0.005762661807239056, 'learning_rate': 1e-05, 'num_tokens': 108862901.0, 'completions/mean_length': 6232.4609375, 'completions/min_length': 276.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5988.82421875, 'completions/min_terminated_length': 276.0, 'completions/max_terminated_length': 15737.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3748064339160919, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01772497221827507, 'sampling/sampling_logp_difference/max': 5.4041595458984375, 'sampling/importance_sampling_ratio/min': 0.004497833084315062, 'sampling/importance_sampling_ratio/mean': 0.9999505877494812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.792289063334465, 'clip_ratio/low_mean': 3.8776780229454744e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.616570095095085e-06, 'clip_ratio/high_max': 1.846628038038034e-05, 'clip_ratio/region_mean': 4.339335077929718e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 135/1024 [5:51:30<39:38:11, 160.51s/it][AINFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 136/1024 [5:54:11<39:36:35, 160.58s/it][A
+                                                        [A{'loss': 0.088, 'grad_norm': 0.002916123950853944, 'learning_rate': 1e-05, 'num_tokens': 109544058.0, 'completions/mean_length': 5181.1015625, 'completions/min_length': 695.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5003.27783203125, 'completions/min_terminated_length': 695.0, 'completions/max_terminated_length': 15440.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.3327339291572571, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017177307978272438, 'sampling/sampling_logp_difference/max': 14.749001502990723, 'sampling/importance_sampling_ratio/min': 3.9317873756772315e-07, 'sampling/importance_sampling_ratio/mean': 0.999925971031189, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7691714614629745, 'clip_ratio/low_mean': 3.377504378931917e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.782972615023027e-06, 'clip_ratio/high_max': 1.1131890460092109e-05, 'clip_ratio/region_mean': 3.65580164043422e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 136/1024 [5:54:11<39:36:35, 160.58s/it][AINFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 137/1024 [5:56:54<39:47:45, 161.52s/it][A
+                                                        [A{'loss': 0.0303, 'grad_norm': 0.0035183338914066553, 'learning_rate': 1e-05, 'num_tokens': 110282853.0, 'completions/mean_length': 5583.5859375, 'completions/min_length': 537.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5235.185546875, 'completions/min_terminated_length': 537.0, 'completions/max_terminated_length': 15288.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.24381661415100098, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01895858161151409, 'sampling/sampling_logp_difference/max': 6.156238079071045, 'sampling/importance_sampling_ratio/min': 0.0021202145144343376, 'sampling/importance_sampling_ratio/mean': 0.9999736547470093, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.922084204852581, 'clip_ratio/low_mean': 3.033036318811355e-05, 'clip_ratio/low_min': 3.5457974263408687e-06, 'clip_ratio/high_mean': 5.5457699090766255e-06, 'clip_ratio/high_max': 2.2183079636306502e-05, 'clip_ratio/region_mean': 3.587613309719018e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 137/1024 [5:56:54<39:47:45, 161.52s/it][AINFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 138/1024 [6:00:06<42:00:12, 170.67s/it][A
+                                                        [A{'loss': 0.0418, 'grad_norm': 0.002201368333771825, 'learning_rate': 1e-05, 'num_tokens': 111228449.0, 'completions/mean_length': 7191.71875, 'completions/min_length': 461.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6659.93359375, 'completions/min_terminated_length': 461.0, 'completions/max_terminated_length': 16255.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01915489323437214, 'sampling/sampling_logp_difference/max': 5.343695163726807, 'sampling/importance_sampling_ratio/min': 0.0047781821340322495, 'sampling/importance_sampling_ratio/mean': 0.9998859167098999, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8676051273941994, 'clip_ratio/low_mean': 2.520359919344628e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.783892558814841e-07, 'clip_ratio/high_max': 2.7135570235259365e-06, 'clip_ratio/region_mean': 2.588198810826725e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 138/1024 [6:00:06<42:00:12, 170.67s/it][AINFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▎        | 139/1024 [6:02:49<41:20:59, 168.20s/it][A
+                                                        [A{'loss': 0.0979, 'grad_norm': 0.00720562506467104, 'learning_rate': 1e-05, 'num_tokens': 111904700.0, 'completions/mean_length': 5139.5859375, 'completions/min_length': 498.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4869.72021484375, 'completions/min_terminated_length': 498.0, 'completions/max_terminated_length': 16102.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3566659688949585, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.016763046383857727, 'sampling/sampling_logp_difference/max': 11.616515159606934, 'sampling/importance_sampling_ratio/min': 9.015951036417391e-06, 'sampling/importance_sampling_ratio/mean': 0.9999786615371704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7077975794672966, 'clip_ratio/low_mean': 4.164742210832628e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.982446049936698e-06, 'clip_ratio/high_max': 2.2828588043921627e-05, 'clip_ratio/region_mean': 4.962986872669717e-05, 'epoch': 0.13}
+
+ 14%|█▎        | 139/1024 [6:02:49<41:20:59, 168.20s/it][AINFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▎        | 140/1024 [6:05:55<42:39:47, 173.74s/it][A
+                                                        [A{'loss': 0.0855, 'grad_norm': 0.005594039335846901, 'learning_rate': 1e-05, 'num_tokens': 112873218.0, 'completions/mean_length': 7408.296875, 'completions/min_length': 678.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7118.7578125, 'completions/min_terminated_length': 678.0, 'completions/max_terminated_length': 15887.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2806568741798401, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018874341621994972, 'sampling/sampling_logp_difference/max': 9.749542236328125, 'sampling/importance_sampling_ratio/min': 5.832135502714664e-05, 'sampling/importance_sampling_ratio/mean': 0.9999697804450989, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8338208198547363, 'clip_ratio/low_mean': 5.0197708333143964e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.257615276197612e-06, 'clip_ratio/high_max': 1.3030461104790447e-05, 'clip_ratio/region_mean': 5.345532326828106e-05, 'epoch': 0.13}
+
+ 14%|█▎        | 140/1024 [6:05:55<42:39:47, 173.74s/it][AINFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 141/1024 [6:09:01<43:28:25, 177.24s/it][A
+                                                        [A{'loss': 0.089, 'grad_norm': 0.0025491444393992424, 'learning_rate': 1e-05, 'num_tokens': 113869418.0, 'completions/mean_length': 7637.25, 'completions/min_length': 943.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7131.2392578125, 'completions/min_terminated_length': 943.0, 'completions/max_terminated_length': 16158.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.32641828060150146, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020018339157104492, 'sampling/sampling_logp_difference/max': 14.212298393249512, 'sampling/importance_sampling_ratio/min': 6.724766876686772e-07, 'sampling/importance_sampling_ratio/mean': 0.9999139308929443, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9943022206425667, 'clip_ratio/low_mean': 3.066379792926455e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.586851668544114e-07, 'clip_ratio/high_max': 2.6347406674176455e-06, 'clip_ratio/region_mean': 3.132248309611896e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 141/1024 [6:09:01<43:28:25, 177.24s/it][AINFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 142/1024 [6:11:42<42:15:06, 172.46s/it][A
+                                                        [A{'loss': 0.1115, 'grad_norm': 0.003907687962055206, 'learning_rate': 1e-05, 'num_tokens': 114674257.0, 'completions/mean_length': 6144.8671875, 'completions/min_length': 1000.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6064.244140625, 'completions/min_terminated_length': 1000.0, 'completions/max_terminated_length': 16199.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.287486732006073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018300339579582214, 'sampling/sampling_logp_difference/max': 5.673813343048096, 'sampling/importance_sampling_ratio/min': 0.003434742335230112, 'sampling/importance_sampling_ratio/mean': 0.9999485611915588, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9252935722470284, 'clip_ratio/low_mean': 2.370427267806008e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.372918283479521e-06, 'clip_ratio/high_max': 1.7491673133918084e-05, 'clip_ratio/region_mean': 2.8077190734165924e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 142/1024 [6:11:42<42:15:06, 172.46s/it][AINFO 12-01 19:36:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:36:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:36:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:36:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 143/1024 [6:14:09<40:21:06, 164.89s/it][A
+                                                        [A{'loss': 0.023, 'grad_norm': 0.0042014638893306255, 'learning_rate': 1e-05, 'num_tokens': 115496300.0, 'completions/mean_length': 6266.6484375, 'completions/min_length': 919.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6186.984375, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 15768.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.18884867429733276, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021998615935444832, 'sampling/sampling_logp_difference/max': 12.561980247497559, 'sampling/importance_sampling_ratio/min': 3.502686922729481e-06, 'sampling/importance_sampling_ratio/mean': 0.9999801516532898, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0926234126091003, 'clip_ratio/low_mean': 2.688816772433711e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0681611658801557e-06, 'clip_ratio/high_max': 8.272644663520623e-06, 'clip_ratio/region_mean': 2.8956328833373846e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 143/1024 [6:14:09<40:21:06, 164.89s/it][AINFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 144/1024 [6:16:41<39:20:34, 160.95s/it][A
+                                                        [A{'loss': 0.0404, 'grad_norm': 0.0028757627587765455, 'learning_rate': 1e-05, 'num_tokens': 116333286.0, 'completions/mean_length': 6392.890625, 'completions/min_length': 559.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6234.3017578125, 'completions/min_terminated_length': 559.0, 'completions/max_terminated_length': 15504.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.35665616393089294, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019069479778409004, 'sampling/sampling_logp_difference/max': 15.27328872680664, 'sampling/importance_sampling_ratio/min': 2.327528392243039e-07, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9028401970863342, 'clip_ratio/low_mean': 4.51459295618406e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.860460075586161e-06, 'clip_ratio/high_max': 2.7441840302344644e-05, 'clip_ratio/region_mean': 5.200638997848728e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 144/1024 [6:16:41<39:20:34, 160.95s/it][AINFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 145/1024 [6:19:13<38:38:38, 158.27s/it][A
+                                                        [A{'loss': 0.0858, 'grad_norm': 0.006776242982596159, 'learning_rate': 1e-05, 'num_tokens': 117158619.0, 'completions/mean_length': 6300.1640625, 'completions/min_length': 73.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6220.763671875, 'completions/min_terminated_length': 73.0, 'completions/max_terminated_length': 16183.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.29826053977012634, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022119753062725067, 'sampling/sampling_logp_difference/max': 14.249761581420898, 'sampling/importance_sampling_ratio/min': 6.477496299339691e-07, 'sampling/importance_sampling_ratio/mean': 0.9998651742935181, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.110174722969532, 'clip_ratio/low_mean': 3.626850991622632e-05, 'clip_ratio/low_min': 4.492201696848497e-06, 'clip_ratio/high_mean': 3.0424674832829623e-06, 'clip_ratio/high_max': 1.216986993313185e-05, 'clip_ratio/region_mean': 3.931097762688296e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 145/1024 [6:19:13<38:38:38, 158.27s/it][AINFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 146/1024 [6:21:58<39:06:40, 160.37s/it][A
+                                                        [A{'loss': 0.0041, 'grad_norm': 0.003441061358898878, 'learning_rate': 1e-05, 'num_tokens': 118140579.0, 'completions/mean_length': 7482.25, 'completions/min_length': 169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7340.95263671875, 'completions/min_terminated_length': 169.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.2109375, 'rewards/accuracy_reward/std': 0.4095771610736847, 'reward': 0.2109375, 'reward_std': 0.23250605165958405, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020916422829031944, 'sampling/sampling_logp_difference/max': 11.356839179992676, 'sampling/importance_sampling_ratio/min': 1.1689271559589542e-05, 'sampling/importance_sampling_ratio/mean': 0.9999172687530518, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9957183450460434, 'clip_ratio/low_mean': 1.452984838579141e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.926812046804116e-06, 'clip_ratio/high_max': 7.707248187216464e-06, 'clip_ratio/region_mean': 1.6456660432595527e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 146/1024 [6:21:58<39:06:40, 160.37s/it][AINFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 147/1024 [6:24:37<38:57:01, 159.89s/it][A
+                                                        [A{'loss': 0.0601, 'grad_norm': 0.0035624606534838676, 'learning_rate': 1e-05, 'num_tokens': 118982515.0, 'completions/mean_length': 6411.125, 'completions/min_length': 415.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6252.82568359375, 'completions/min_terminated_length': 415.0, 'completions/max_terminated_length': 16193.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3913620114326477, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020998675376176834, 'sampling/sampling_logp_difference/max': 3.96539044380188, 'sampling/importance_sampling_ratio/min': 0.018960632383823395, 'sampling/importance_sampling_ratio/mean': 0.9999991655349731, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9852773621678352, 'clip_ratio/low_mean': 4.652173765862244e-05, 'clip_ratio/low_min': 8.251542112702737e-06, 'clip_ratio/high_mean': 3.4127203889511293e-06, 'clip_ratio/high_max': 1.3650881555804517e-05, 'clip_ratio/region_mean': 4.993445759282622e-05, 'epoch': 0.14}
+
+ 14%|█▍        | 147/1024 [6:24:37<38:57:01, 159.89s/it][AINFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 148/1024 [6:27:38<40:27:19, 166.26s/it][A
+                                                        [A{'loss': 0.0756, 'grad_norm': 0.004949269350618124, 'learning_rate': 1e-05, 'num_tokens': 119851003.0, 'completions/mean_length': 6640.75, 'completions/min_length': 1204.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6326.45166015625, 'completions/min_terminated_length': 1204.0, 'completions/max_terminated_length': 15146.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01905224658548832, 'sampling/sampling_logp_difference/max': 9.749635696411133, 'sampling/importance_sampling_ratio/min': 5.8315905334893614e-05, 'sampling/importance_sampling_ratio/mean': 0.9999769926071167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8645239844918251, 'clip_ratio/low_mean': 2.3662243620492518e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.276765594113385e-06, 'clip_ratio/high_max': 1.710706237645354e-05, 'clip_ratio/region_mean': 2.7939009100919066e-05, 'epoch': 0.14}
+
+ 14%|█▍        | 148/1024 [6:27:38<40:27:19, 166.26s/it][AINFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 149/1024 [6:30:31<40:54:55, 168.34s/it][A
+                                                        [A{'loss': 0.1008, 'grad_norm': 0.005622676108032465, 'learning_rate': 1e-05, 'num_tokens': 120765165.0, 'completions/mean_length': 6987.953125, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6444.3798828125, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 16061.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.39796435832977295, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01969297230243683, 'sampling/sampling_logp_difference/max': 9.292106628417969, 'sampling/importance_sampling_ratio/min': 9.214873716700822e-05, 'sampling/importance_sampling_ratio/mean': 0.9999727010726929, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9469119384884834, 'clip_ratio/low_mean': 5.667686264132499e-05, 'clip_ratio/low_min': 3.2221478249994107e-06, 'clip_ratio/high_mean': 2.0922732346662087e-06, 'clip_ratio/high_max': 5.033624802308623e-06, 'clip_ratio/region_mean': 5.876913564861752e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 149/1024 [6:30:31<40:54:55, 168.34s/it][AINFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 150/1024 [6:33:34<41:54:35, 172.63s/it][A
+                                                        [A{'loss': -0.0093, 'grad_norm': 0.0035846447572112083, 'learning_rate': 1e-05, 'num_tokens': 121749426.0, 'completions/mean_length': 7539.2265625, 'completions/min_length': 103.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6949.5751953125, 'completions/min_terminated_length': 103.0, 'completions/max_terminated_length': 16218.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.22461043298244476, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02050059661269188, 'sampling/sampling_logp_difference/max': 11.749993324279785, 'sampling/importance_sampling_ratio/min': 7.889377229730599e-06, 'sampling/importance_sampling_ratio/mean': 1.0000232458114624, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.983614593744278, 'clip_ratio/low_mean': 3.030186894648068e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8565209529697313e-06, 'clip_ratio/high_max': 4.223829364491394e-06, 'clip_ratio/region_mean': 3.21583895583899e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 150/1024 [6:33:34<41:54:35, 172.63s/it][AINFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 151/1024 [6:36:14<40:54:26, 168.69s/it][A
+                                                        [A{'loss': 0.0479, 'grad_norm': 0.005333681590855122, 'learning_rate': 1e-05, 'num_tokens': 122579975.0, 'completions/mean_length': 6339.5390625, 'completions/min_length': 363.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5845.548828125, 'completions/min_terminated_length': 363.0, 'completions/max_terminated_length': 15528.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.327729195356369, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019756250083446503, 'sampling/sampling_logp_difference/max': 6.091750144958496, 'sampling/importance_sampling_ratio/min': 0.0022614477202296257, 'sampling/importance_sampling_ratio/mean': 0.9999289512634277, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9051830619573593, 'clip_ratio/low_mean': 4.44662659901951e-05, 'clip_ratio/low_min': 5.9182802942814305e-06, 'clip_ratio/high_mean': 2.6333877940487582e-06, 'clip_ratio/high_max': 1.0533551176195033e-05, 'clip_ratio/region_mean': 4.7099654238991207e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 151/1024 [6:36:14<40:54:26, 168.69s/it][AINFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 152/1024 [6:38:49<39:54:24, 164.75s/it][A
+                                                        [A{'loss': 0.1029, 'grad_norm': 0.005628545768558979, 'learning_rate': 1e-05, 'num_tokens': 123444686.0, 'completions/mean_length': 6610.8046875, 'completions/min_length': 856.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6533.8505859375, 'completions/min_terminated_length': 856.0, 'completions/max_terminated_length': 15321.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3498311936855316, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019961554557085037, 'sampling/sampling_logp_difference/max': 5.890087127685547, 'sampling/importance_sampling_ratio/min': 0.0027667356189340353, 'sampling/importance_sampling_ratio/mean': 0.9999935030937195, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9121239259839058, 'clip_ratio/low_mean': 5.054293433204293e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4903662304277532e-06, 'clip_ratio/high_max': 5.961464921711013e-06, 'clip_ratio/region_mean': 5.2033300562470686e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 152/1024 [6:38:49<39:54:24, 164.75s/it][AINFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 153/1024 [6:41:40<40:17:41, 166.55s/it][A
+                                                        [A{'loss': 0.0179, 'grad_norm': 0.00521192466840148, 'learning_rate': 1e-05, 'num_tokens': 124389325.0, 'completions/mean_length': 7214.5546875, 'completions/min_length': 493.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6684.0908203125, 'completions/min_terminated_length': 493.0, 'completions/max_terminated_length': 15071.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.26538968086242676, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02030467614531517, 'sampling/sampling_logp_difference/max': 3.246713638305664, 'sampling/importance_sampling_ratio/min': 0.03890184313058853, 'sampling/importance_sampling_ratio/mean': 1.0000994205474854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9393481463193893, 'clip_ratio/low_mean': 4.231535649523721e-05, 'clip_ratio/low_min': 3.3862490909086773e-06, 'clip_ratio/high_mean': 2.778689122351352e-06, 'clip_ratio/high_max': 7.918152277852641e-06, 'clip_ratio/region_mean': 4.509404539021489e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 153/1024 [6:41:40<40:17:41, 166.55s/it][AINFO 12-01 20:06:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:06:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:06:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:06:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 154/1024 [6:44:32<40:41:12, 168.36s/it][A
+                                                        [A{'loss': 0.0557, 'grad_norm': 0.0034769594203680754, 'learning_rate': 1e-05, 'num_tokens': 125344827.0, 'completions/mean_length': 7307.296875, 'completions/min_length': 656.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6938.32470703125, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 15349.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.35035035014152527, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0197945274412632, 'sampling/sampling_logp_difference/max': 9.88245964050293, 'sampling/importance_sampling_ratio/min': 5.1062532293144614e-05, 'sampling/importance_sampling_ratio/mean': 0.9999738335609436, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9287968128919601, 'clip_ratio/low_mean': 4.0359405488743505e-05, 'clip_ratio/low_min': 3.400342848181026e-06, 'clip_ratio/high_mean': 3.274841219536029e-06, 'clip_ratio/high_max': 1.3099364878144115e-05, 'clip_ratio/region_mean': 4.363424682196637e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 154/1024 [6:44:32<40:41:12, 168.36s/it][AINFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 155/1024 [6:47:15<40:14:28, 166.71s/it][A
+                                                        [A{'loss': 0.058, 'grad_norm': 0.005860861856490374, 'learning_rate': 1e-05, 'num_tokens': 126294060.0, 'completions/mean_length': 7255.5703125, 'completions/min_length': 401.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7110.6748046875, 'completions/min_terminated_length': 401.0, 'completions/max_terminated_length': 14940.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.29719966650009155, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019849762320518494, 'sampling/sampling_logp_difference/max': 6.374942779541016, 'sampling/importance_sampling_ratio/min': 0.0017037172801792622, 'sampling/importance_sampling_ratio/mean': 0.9999392032623291, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9288185387849808, 'clip_ratio/low_mean': 3.123730675724801e-05, 'clip_ratio/low_min': 4.124868155486183e-06, 'clip_ratio/high_mean': 1.607209924259223e-06, 'clip_ratio/high_max': 6.428839697036892e-06, 'clip_ratio/region_mean': 3.284451713625458e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 155/1024 [6:47:15<40:14:28, 166.71s/it][AINFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 156/1024 [6:50:02<40:11:21, 166.68s/it][A
+                                                        [A{'loss': 0.0365, 'grad_norm': 0.004109901376068592, 'learning_rate': 1e-05, 'num_tokens': 127163746.0, 'completions/mean_length': 6662.796875, 'completions/min_length': 402.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6429.48828125, 'completions/min_terminated_length': 402.0, 'completions/max_terminated_length': 16174.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2782978415489197, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018542557954788208, 'sampling/sampling_logp_difference/max': 6.249782562255859, 'sampling/importance_sampling_ratio/min': 0.001930873841047287, 'sampling/importance_sampling_ratio/mean': 0.9998985528945923, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8072321340441704, 'clip_ratio/low_mean': 4.209472854199703e-05, 'clip_ratio/low_min': 3.21056154461985e-06, 'clip_ratio/high_mean': 2.8721049147861777e-06, 'clip_ratio/high_max': 1.148841965914471e-05, 'clip_ratio/region_mean': 4.496683322940953e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 156/1024 [6:50:02<40:11:21, 166.68s/it][AINFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 157/1024 [6:52:51<40:18:06, 167.34s/it][A
+                                                        [A{'loss': 0.1272, 'grad_norm': 0.005437003914266825, 'learning_rate': 1e-05, 'num_tokens': 128035690.0, 'completions/mean_length': 6638.5625, 'completions/min_length': 730.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6483.87353515625, 'completions/min_terminated_length': 730.0, 'completions/max_terminated_length': 16168.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.325370192527771, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019497953355312347, 'sampling/sampling_logp_difference/max': 7.152168273925781, 'sampling/importance_sampling_ratio/min': 0.0007831641123630106, 'sampling/importance_sampling_ratio/mean': 0.9999808073043823, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9228496253490448, 'clip_ratio/low_mean': 3.845731936280572e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7114781434866018e-06, 'clip_ratio/high_max': 1.4845912573946407e-05, 'clip_ratio/region_mean': 4.216879796103967e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 157/1024 [6:52:51<40:18:06, 167.34s/it][AINFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 158/1024 [6:55:46<40:51:20, 169.84s/it][A
+                                                        [A{'loss': 0.0553, 'grad_norm': 0.004606325179338455, 'learning_rate': 1e-05, 'num_tokens': 129114487.0, 'completions/mean_length': 8279.7890625, 'completions/min_length': 1084.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7810.9501953125, 'completions/min_terminated_length': 1084.0, 'completions/max_terminated_length': 16133.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2998581528663635, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02114839106798172, 'sampling/sampling_logp_difference/max': 11.899483680725098, 'sampling/importance_sampling_ratio/min': 6.793912234570598e-06, 'sampling/importance_sampling_ratio/mean': 0.9999224543571472, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9365477114915848, 'clip_ratio/low_mean': 5.087737986286811e-05, 'clip_ratio/low_min': 1.7309419035882456e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.087737986286811e-05, 'epoch': 0.15}
+
+ 15%|█▌        | 158/1024 [6:55:46<40:51:20, 169.84s/it][AINFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 159/1024 [6:58:46<41:29:01, 172.65s/it][A
+                                                        [A{'loss': 0.0979, 'grad_norm': 0.0032216343097388744, 'learning_rate': 1e-05, 'num_tokens': 130011934.0, 'completions/mean_length': 6874.5546875, 'completions/min_length': 379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6406.87646484375, 'completions/min_terminated_length': 379.0, 'completions/max_terminated_length': 15157.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.28801077604293823, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01938377134501934, 'sampling/sampling_logp_difference/max': 5.874353408813477, 'sampling/importance_sampling_ratio/min': 0.0028106109239161015, 'sampling/importance_sampling_ratio/mean': 0.9999432563781738, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8596161976456642, 'clip_ratio/low_mean': 4.6293902641991735e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.731617188255768e-06, 'clip_ratio/high_max': 2.8393386855896097e-05, 'clip_ratio/region_mean': 5.402551937550015e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 159/1024 [6:58:46<41:29:01, 172.65s/it][AINFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 160/1024 [7:01:47<42:04:04, 175.28s/it][A
+                                                        [A{'loss': 0.0401, 'grad_norm': 0.0032756594009697437, 'learning_rate': 1e-05, 'num_tokens': 130870045.0, 'completions/mean_length': 6554.3671875, 'completions/min_length': 957.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6154.78857421875, 'completions/min_terminated_length': 957.0, 'completions/max_terminated_length': 16193.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3006146550178528, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019287925213575363, 'sampling/sampling_logp_difference/max': 18.499998092651367, 'sampling/importance_sampling_ratio/min': 9.237467679668043e-09, 'sampling/importance_sampling_ratio/mean': 0.9999619722366333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9097465947270393, 'clip_ratio/low_mean': 2.8597237701433187e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.425736511213472e-06, 'clip_ratio/high_max': 9.702946044853888e-06, 'clip_ratio/region_mean': 3.1022973985272984e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 160/1024 [7:01:47<42:04:04, 175.28s/it][AINFO 12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 161/1024 [7:04:34<41:25:53, 172.83s/it][A
+                                                        [A{'loss': 0.069, 'grad_norm': 0.003530750283971429, 'learning_rate': 1e-05, 'num_tokens': 131812236.0, 'completions/mean_length': 7199.9921875, 'completions/min_length': 431.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6903.73388671875, 'completions/min_terminated_length': 431.0, 'completions/max_terminated_length': 15371.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.30221718549728394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02212757244706154, 'sampling/sampling_logp_difference/max': 12.864561080932617, 'sampling/importance_sampling_ratio/min': 2.5881658984872047e-06, 'sampling/importance_sampling_ratio/mean': 0.9999665021896362, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9904173016548157, 'clip_ratio/low_mean': 4.071546266004589e-05, 'clip_ratio/low_min': 2.701884795897058e-06, 'clip_ratio/high_mean': 5.969264975647093e-06, 'clip_ratio/high_max': 2.387705990258837e-05, 'clip_ratio/region_mean': 4.6684727863066655e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 161/1024 [7:04:34<41:25:53, 172.83s/it][AINFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 162/1024 [7:07:13<40:21:26, 168.55s/it][A
+                                                        [A{'loss': 0.0287, 'grad_norm': 0.004500554408878088, 'learning_rate': 1e-05, 'num_tokens': 132711448.0, 'completions/mean_length': 6822.59375, 'completions/min_length': 139.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6670.82568359375, 'completions/min_terminated_length': 139.0, 'completions/max_terminated_length': 16281.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02111719362437725, 'sampling/sampling_logp_difference/max': 15.995189666748047, 'sampling/importance_sampling_ratio/min': 1.1307781022651398e-07, 'sampling/importance_sampling_ratio/mean': 0.9998499751091003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0052980855107307, 'clip_ratio/low_mean': 4.526082898337336e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.81041513467062e-06, 'clip_ratio/high_max': 1.924166053868248e-05, 'clip_ratio/region_mean': 5.007124354960979e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 162/1024 [7:07:13<40:21:26, 168.55s/it][AINFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 163/1024 [7:10:16<41:21:09, 172.90s/it][A
+                                                        [A{'loss': 0.0782, 'grad_norm': 0.0020288117229938507, 'learning_rate': 1e-05, 'num_tokens': 133729832.0, 'completions/mean_length': 7792.9375, 'completions/min_length': 957.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7515.80615234375, 'completions/min_terminated_length': 957.0, 'completions/max_terminated_length': 16109.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2501322627067566, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020228523761034012, 'sampling/sampling_logp_difference/max': 6.4299726486206055, 'sampling/importance_sampling_ratio/min': 0.001612494932487607, 'sampling/importance_sampling_ratio/mean': 0.9999821782112122, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9114394783973694, 'clip_ratio/low_mean': 1.9409651486057555e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.331508196424693e-06, 'clip_ratio/high_max': 1.3326032785698771e-05, 'clip_ratio/region_mean': 2.274115956879541e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 163/1024 [7:10:16<41:21:09, 172.90s/it][AINFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 164/1024 [7:13:02<40:49:12, 170.87s/it][A
+                                                        [A{'loss': -0.0036, 'grad_norm': 0.006685085594654083, 'learning_rate': 1e-05, 'num_tokens': 134507182.0, 'completions/mean_length': 5908.671875, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5826.18896484375, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 15171.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01997402310371399, 'sampling/sampling_logp_difference/max': 7.111015796661377, 'sampling/importance_sampling_ratio/min': 0.0008160656434483826, 'sampling/importance_sampling_ratio/mean': 0.9999651908874512, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9869658201932907, 'clip_ratio/low_mean': 2.9356229674704082e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.089760639340966e-06, 'clip_ratio/high_max': 1.2359042557363864e-05, 'clip_ratio/region_mean': 3.244599008667137e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 164/1024 [7:13:02<40:49:12, 170.87s/it][AINFO 12-01 20:38:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:38:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:38:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:38:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 165/1024 [7:15:48<40:27:37, 169.57s/it][A
+                                                        [A{'loss': 0.0946, 'grad_norm': 0.003854887094348669, 'learning_rate': 1e-05, 'num_tokens': 135446382.0, 'completions/mean_length': 7188.0, 'completions/min_length': 585.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6735.7373046875, 'completions/min_terminated_length': 585.0, 'completions/max_terminated_length': 16000.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020226184278726578, 'sampling/sampling_logp_difference/max': 6.780747890472412, 'sampling/importance_sampling_ratio/min': 0.0011354254093021154, 'sampling/importance_sampling_ratio/mean': 0.9998975992202759, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9519504383206367, 'clip_ratio/low_mean': 3.215114134036412e-05, 'clip_ratio/low_min': 3.941849627153715e-06, 'clip_ratio/high_mean': 2.1278583517414518e-06, 'clip_ratio/high_max': 8.511433406965807e-06, 'clip_ratio/region_mean': 3.427900014685292e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 165/1024 [7:15:48<40:27:37, 169.57s/it][AINFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 166/1024 [7:18:21<39:10:12, 164.35s/it][A
+                                                        [A{'loss': 0.0055, 'grad_norm': 0.006265874952077866, 'learning_rate': 1e-05, 'num_tokens': 136213233.0, 'completions/mean_length': 5843.5234375, 'completions/min_length': 251.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5676.21484375, 'completions/min_terminated_length': 251.0, 'completions/max_terminated_length': 15712.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021706756204366684, 'sampling/sampling_logp_difference/max': 6.129936218261719, 'sampling/importance_sampling_ratio/min': 0.002176719717681408, 'sampling/importance_sampling_ratio/mean': 0.9999513626098633, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9677107483148575, 'clip_ratio/low_mean': 1.9188738406228367e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.778701175680908e-06, 'clip_ratio/high_max': 7.114804702723632e-06, 'clip_ratio/region_mean': 2.0967439695596113e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 166/1024 [7:18:21<39:10:12, 164.35s/it][AINFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▋        | 167/1024 [7:21:11<39:34:17, 166.23s/it][A
+                                                        [A{'loss': 0.0052, 'grad_norm': 0.0018056798726320267, 'learning_rate': 1e-05, 'num_tokens': 137123405.0, 'completions/mean_length': 6942.15625, 'completions/min_length': 517.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6637.58056640625, 'completions/min_terminated_length': 517.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.172288179397583, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02278529666364193, 'sampling/sampling_logp_difference/max': 3.781208038330078, 'sampling/importance_sampling_ratio/min': 0.022795137017965317, 'sampling/importance_sampling_ratio/mean': 0.9999101161956787, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.076062560081482, 'clip_ratio/low_mean': 2.429895857858355e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4804112424826599e-06, 'clip_ratio/high_max': 5.9216449699306395e-06, 'clip_ratio/region_mean': 3.910307100341015e-06, 'epoch': 0.15}
+
+ 16%|█▋        | 167/1024 [7:21:11<39:34:17, 166.23s/it][AINFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▋        | 168/1024 [7:24:01<39:47:12, 167.33s/it][A
+                                                        [A{'loss': 0.0702, 'grad_norm': 0.002132089575752616, 'learning_rate': 1e-05, 'num_tokens': 138084464.0, 'completions/mean_length': 7368.4609375, 'completions/min_length': 660.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7001.9755859375, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3148210048675537, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020028186962008476, 'sampling/sampling_logp_difference/max': 9.874905586242676, 'sampling/importance_sampling_ratio/min': 5.144971510162577e-05, 'sampling/importance_sampling_ratio/mean': 0.999951958656311, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9278362467885017, 'clip_ratio/low_mean': 4.042915224999888e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.364482027900522e-06, 'clip_ratio/high_max': 2.8421666684153024e-05, 'clip_ratio/region_mean': 4.8793634050525725e-05, 'epoch': 0.15}
+
+ 16%|█▋        | 168/1024 [7:24:01<39:47:12, 167.33s/it][AINFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 169/1024 [7:26:59<40:30:51, 170.59s/it][A
+                                                        [A{'loss': 0.0708, 'grad_norm': 0.003180777421221137, 'learning_rate': 1e-05, 'num_tokens': 139164722.0, 'completions/mean_length': 8278.578125, 'completions/min_length': 1203.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8017.11279296875, 'completions/min_terminated_length': 1203.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020700933411717415, 'sampling/sampling_logp_difference/max': 12.29391098022461, 'sampling/importance_sampling_ratio/min': 4.579544565785909e-06, 'sampling/importance_sampling_ratio/mean': 0.9999357461929321, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9731236174702644, 'clip_ratio/low_mean': 3.8177841361175524e-05, 'clip_ratio/low_min': 9.023873644764535e-06, 'clip_ratio/high_mean': 1.7118109099101275e-06, 'clip_ratio/high_max': 6.84724363964051e-06, 'clip_ratio/region_mean': 3.988965249845933e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 169/1024 [7:26:59<40:30:51, 170.59s/it][AINFO 12-01 20:51:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:51:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:51:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:51:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 170/1024 [7:29:55<40:51:20, 172.23s/it][A
+                                                        [A{'loss': 0.0955, 'grad_norm': 0.004162010736763477, 'learning_rate': 1e-05, 'num_tokens': 140109163.0, 'completions/mean_length': 7237.2578125, 'completions/min_length': 1078.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6865.43896484375, 'completions/min_terminated_length': 1078.0, 'completions/max_terminated_length': 16136.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.33903974294662476, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017928704619407654, 'sampling/sampling_logp_difference/max': 10.63192367553711, 'sampling/importance_sampling_ratio/min': 2.4133163606165908e-05, 'sampling/importance_sampling_ratio/mean': 0.9999967813491821, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7624354660511017, 'clip_ratio/low_mean': 4.41923687048984e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.712801448178652e-06, 'clip_ratio/high_max': 2.3081439849192975e-05, 'clip_ratio/region_mean': 5.190517117625859e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 170/1024 [7:29:55<40:51:20, 172.23s/it][AINFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 171/1024 [7:32:53<41:13:03, 173.96s/it][A
+                                                        [A{'loss': 0.0442, 'grad_norm': 0.003527693450450897, 'learning_rate': 1e-05, 'num_tokens': 141063738.0, 'completions/mean_length': 7307.4296875, 'completions/min_length': 290.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7089.59228515625, 'completions/min_terminated_length': 290.0, 'completions/max_terminated_length': 15857.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.22673209011554718, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021664291620254517, 'sampling/sampling_logp_difference/max': 10.455191612243652, 'sampling/importance_sampling_ratio/min': 2.8798374842153862e-05, 'sampling/importance_sampling_ratio/mean': 0.9998871088027954, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9450376927852631, 'clip_ratio/low_mean': 2.0606968291758676e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.502144406615116e-06, 'clip_ratio/high_max': 1.8008577626460465e-05, 'clip_ratio/region_mean': 2.510911281206063e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 171/1024 [7:32:53<41:13:03, 173.96s/it][AINFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 172/1024 [7:35:32<40:06:29, 169.47s/it][A
+                                                        [A{'loss': 0.0778, 'grad_norm': 0.002400327706709504, 'learning_rate': 1e-05, 'num_tokens': 141848599.0, 'completions/mean_length': 5985.9765625, 'completions/min_length': 714.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5736.42431640625, 'completions/min_terminated_length': 714.0, 'completions/max_terminated_length': 16275.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.1922685205936432, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018963739275932312, 'sampling/sampling_logp_difference/max': 18.115007400512695, 'sampling/importance_sampling_ratio/min': 1.3575387924902316e-08, 'sampling/importance_sampling_ratio/mean': 0.9999374151229858, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8568939119577408, 'clip_ratio/low_mean': 3.323748410366534e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.788794740306912e-06, 'clip_ratio/high_max': 1.9155178961227648e-05, 'clip_ratio/region_mean': 3.802627873028541e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 172/1024 [7:35:32<40:06:29, 169.47s/it][AINFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 173/1024 [7:38:24<40:12:47, 170.11s/it][A
+                                                        [A{'loss': 0.0625, 'grad_norm': 0.003575773909687996, 'learning_rate': 1e-05, 'num_tokens': 142902666.0, 'completions/mean_length': 8078.8359375, 'completions/min_length': 594.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7810.92724609375, 'completions/min_terminated_length': 594.0, 'completions/max_terminated_length': 15111.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3022122383117676, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021685753017663956, 'sampling/sampling_logp_difference/max': 13.205151557922363, 'sampling/importance_sampling_ratio/min': 1.8410922848488553e-06, 'sampling/importance_sampling_ratio/mean': 0.9999899864196777, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0634759217500687, 'clip_ratio/low_mean': 4.1565862602510606e-05, 'clip_ratio/low_min': 6.89249168317474e-06, 'clip_ratio/high_mean': 4.978134711564053e-06, 'clip_ratio/high_max': 1.6673273876222083e-05, 'clip_ratio/region_mean': 4.654399640457996e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 173/1024 [7:38:24<40:12:47, 170.11s/it][AINFO 12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 174/1024 [7:41:22<40:42:25, 172.41s/it][A
+                                                        [A{'loss': 0.0364, 'grad_norm': 0.003307635197415948, 'learning_rate': 1e-05, 'num_tokens': 143967484.0, 'completions/mean_length': 8138.515625, 'completions/min_length': 660.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7588.81689453125, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 15876.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.31800350546836853, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02233392372727394, 'sampling/sampling_logp_difference/max': 2.537085771560669, 'sampling/importance_sampling_ratio/min': 0.07909657061100006, 'sampling/importance_sampling_ratio/mean': 1.0000429153442383, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0329038575291634, 'clip_ratio/low_mean': 4.288118509521155e-05, 'clip_ratio/low_min': 7.69851726545312e-06, 'clip_ratio/high_mean': 3.081458999076858e-06, 'clip_ratio/high_max': 1.2325835996307433e-05, 'clip_ratio/region_mean': 4.596264443534892e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 174/1024 [7:41:22<40:42:25, 172.41s/it][AINFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 175/1024 [7:44:27<41:33:00, 176.18s/it][A
+                                                        [A{'loss': 0.0258, 'grad_norm': 0.0022392498794943094, 'learning_rate': 1e-05, 'num_tokens': 145028608.0, 'completions/mean_length': 8144.21875, 'completions/min_length': 828.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7878.4189453125, 'completions/min_terminated_length': 828.0, 'completions/max_terminated_length': 16324.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.20411096513271332, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0203234925866127, 'sampling/sampling_logp_difference/max': 12.749860763549805, 'sampling/importance_sampling_ratio/min': 2.9027246455370914e-06, 'sampling/importance_sampling_ratio/mean': 0.9999473094940186, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9547601044178009, 'clip_ratio/low_mean': 3.4071419804604375e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.789598162664333e-06, 'clip_ratio/high_max': 2.3158392650657333e-05, 'clip_ratio/region_mean': 3.986101773989503e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 175/1024 [7:44:27<41:33:00, 176.18s/it][AINFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 176/1024 [7:47:01<39:58:34, 169.71s/it][A
+                                                        [A{'loss': 0.085, 'grad_norm': 0.005551324691623449, 'learning_rate': 1e-05, 'num_tokens': 145851292.0, 'completions/mean_length': 6289.40625, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6129.1748046875, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 16327.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.327729195356369, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020259611308574677, 'sampling/sampling_logp_difference/max': 5.996909141540527, 'sampling/importance_sampling_ratio/min': 0.0024864254519343376, 'sampling/importance_sampling_ratio/mean': 0.9999369382858276, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9483931511640549, 'clip_ratio/low_mean': 3.57260964847228e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.326393539282435e-06, 'clip_ratio/high_max': 1.330557415712974e-05, 'clip_ratio/region_mean': 3.905248979663156e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 176/1024 [7:47:01<39:58:34, 169.71s/it][AINFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 177/1024 [7:50:09<41:10:47, 175.03s/it][A
+                                                        [A{'loss': 0.0757, 'grad_norm': 0.0038497373461723328, 'learning_rate': 1e-05, 'num_tokens': 147004723.0, 'completions/mean_length': 8855.9296875, 'completions/min_length': 1004.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8354.05859375, 'completions/min_terminated_length': 1004.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02178027108311653, 'sampling/sampling_logp_difference/max': 7.8969340324401855, 'sampling/importance_sampling_ratio/min': 0.0003718819934874773, 'sampling/importance_sampling_ratio/mean': 1.0000008344650269, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.003264345228672, 'clip_ratio/low_mean': 5.073524926046957e-05, 'clip_ratio/low_min': 2.859953838196816e-06, 'clip_ratio/high_mean': 2.086053825678391e-06, 'clip_ratio/high_max': 8.344215302713565e-06, 'clip_ratio/region_mean': 5.282130268824403e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 177/1024 [7:50:09<41:10:47, 175.03s/it][AINFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 178/1024 [7:53:04<41:07:55, 175.03s/it][A
+                                                        [A{'loss': 0.054, 'grad_norm': 0.005027150269597769, 'learning_rate': 1e-05, 'num_tokens': 147996190.0, 'completions/mean_length': 7574.3359375, 'completions/min_length': 856.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7434.50048828125, 'completions/min_terminated_length': 856.0, 'completions/max_terminated_length': 16199.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3316858410835266, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020686112344264984, 'sampling/sampling_logp_difference/max': 12.769495964050293, 'sampling/importance_sampling_ratio/min': 2.846284814950195e-06, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9448538422584534, 'clip_ratio/low_mean': 4.947490833728807e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0921258939997642e-06, 'clip_ratio/high_max': 1.2368503575999057e-05, 'clip_ratio/region_mean': 5.256703434497467e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 178/1024 [7:53:04<41:07:55, 175.03s/it][AINFO 12-01 21:18:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:18:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:18:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:18:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 179/1024 [7:55:52<40:34:07, 172.84s/it][A
+                                                        [A{'loss': 0.0743, 'grad_norm': 0.00325182662345469, 'learning_rate': 1e-05, 'num_tokens': 148931006.0, 'completions/mean_length': 7162.5625, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6787.70703125, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15821.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3492894768714905, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02043815702199936, 'sampling/sampling_logp_difference/max': 15.537620544433594, 'sampling/importance_sampling_ratio/min': 1.7868870827442151e-07, 'sampling/importance_sampling_ratio/mean': 0.9999456405639648, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8928515017032623, 'clip_ratio/low_mean': 3.363430948866153e-05, 'clip_ratio/low_min': 3.5745945297094295e-06, 'clip_ratio/high_mean': 4.189188416603429e-06, 'clip_ratio/high_max': 1.6756753666413715e-05, 'clip_ratio/region_mean': 3.7823498018951796e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 179/1024 [7:55:52<40:34:07, 172.84s/it][AINFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 180/1024 [7:58:56<41:20:23, 176.33s/it][A
+                                                        [A{'loss': 0.0388, 'grad_norm': 0.003250610316172242, 'learning_rate': 1e-05, 'num_tokens': 149968481.0, 'completions/mean_length': 7958.2109375, 'completions/min_length': 809.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7396.4921875, 'completions/min_terminated_length': 809.0, 'completions/max_terminated_length': 16163.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2858940362930298, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020478684455156326, 'sampling/sampling_logp_difference/max': 13.499983787536621, 'sampling/importance_sampling_ratio/min': 1.370981294712692e-06, 'sampling/importance_sampling_ratio/mean': 0.999974250793457, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8763524517416954, 'clip_ratio/low_mean': 2.8009484594804235e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.204079798204475e-06, 'clip_ratio/high_max': 2.08163191928179e-05, 'clip_ratio/region_mean': 3.3213564165635034e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 180/1024 [7:58:56<41:20:23, 176.33s/it][AINFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 181/1024 [8:01:55<41:29:37, 177.20s/it][A
+                                                        [A{'loss': 0.019, 'grad_norm': 0.004865634720772505, 'learning_rate': 1e-05, 'num_tokens': 150768791.0, 'completions/mean_length': 6120.296875, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5789.20947265625, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15728.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01739395596086979, 'sampling/sampling_logp_difference/max': 10.249953269958496, 'sampling/importance_sampling_ratio/min': 3.535915311658755e-05, 'sampling/importance_sampling_ratio/mean': 0.9999062418937683, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7507334873080254, 'clip_ratio/low_mean': 1.937760777082076e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.918068043480162e-06, 'clip_ratio/high_max': 1.4398233361134771e-05, 'clip_ratio/region_mean': 2.4295676269048272e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 181/1024 [8:01:55<41:29:37, 177.20s/it][AINFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 182/1024 [8:05:08<42:33:11, 181.94s/it][A
+                                                        [A{'loss': 0.0609, 'grad_norm': 0.0027805580757558346, 'learning_rate': 1e-05, 'num_tokens': 151844301.0, 'completions/mean_length': 8231.671875, 'completions/min_length': 1231.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 7230.5087890625, 'completions/min_terminated_length': 1231.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.35088711977005005, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019971080124378204, 'sampling/sampling_logp_difference/max': 6.454617977142334, 'sampling/importance_sampling_ratio/min': 0.0015732402680441737, 'sampling/importance_sampling_ratio/mean': 0.999957799911499, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8613645136356354, 'clip_ratio/low_mean': 5.480891331899329e-05, 'clip_ratio/low_min': 9.078275525098434e-06, 'clip_ratio/high_mean': 2.9266581691445026e-06, 'clip_ratio/high_max': 1.170663267657801e-05, 'clip_ratio/region_mean': 5.773557131760754e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 182/1024 [8:05:08<42:33:11, 181.94s/it][AINFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 183/1024 [8:07:36<40:07:34, 171.77s/it][A
+                                                        [A{'loss': 0.0534, 'grad_norm': 0.0028903940692543983, 'learning_rate': 1e-05, 'num_tokens': 152638356.0, 'completions/mean_length': 6038.4921875, 'completions/min_length': 769.0, 'completions/max_length': 15682.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6038.4921875, 'completions/min_terminated_length': 769.0, 'completions/max_terminated_length': 15682.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3022122383117676, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019382324069738388, 'sampling/sampling_logp_difference/max': 12.374916076660156, 'sampling/importance_sampling_ratio/min': 4.2232054511259776e-06, 'sampling/importance_sampling_ratio/mean': 0.9999019503593445, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8801494240760803, 'clip_ratio/low_mean': 4.333486742780224e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.714717084018048e-06, 'clip_ratio/high_max': 1.0858868336072192e-05, 'clip_ratio/region_mean': 4.60495848528808e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 183/1024 [8:07:36<40:07:34, 171.77s/it][AINFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 184/1024 [8:10:33<40:25:51, 173.28s/it][A
+                                                        [A{'loss': 0.0796, 'grad_norm': 0.0029546513687819242, 'learning_rate': 1e-05, 'num_tokens': 153618418.0, 'completions/mean_length': 7506.921875, 'completions/min_length': 557.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7070.34375, 'completions/min_terminated_length': 557.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3448137044906616, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01928526908159256, 'sampling/sampling_logp_difference/max': 14.616228103637695, 'sampling/importance_sampling_ratio/min': 4.4900667717229226e-07, 'sampling/importance_sampling_ratio/mean': 1.0000388622283936, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8713229671120644, 'clip_ratio/low_mean': 4.994629193788569e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.830143276038143e-06, 'clip_ratio/high_max': 7.320573104152572e-06, 'clip_ratio/region_mean': 5.177643492970674e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 184/1024 [8:10:33<40:25:51, 173.28s/it][AINFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 185/1024 [8:13:19<39:51:20, 171.01s/it][A
+                                                        [A{'loss': 0.0837, 'grad_norm': 0.002384800696745515, 'learning_rate': 1e-05, 'num_tokens': 154502440.0, 'completions/mean_length': 6725.921875, 'completions/min_length': 253.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6649.8740234375, 'completions/min_terminated_length': 253.0, 'completions/max_terminated_length': 13999.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020737573504447937, 'sampling/sampling_logp_difference/max': 7.082281589508057, 'sampling/importance_sampling_ratio/min': 0.0008398547652177513, 'sampling/importance_sampling_ratio/mean': 0.9999340772628784, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9011344686150551, 'clip_ratio/low_mean': 2.8494011758084525e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2481475500389934e-06, 'clip_ratio/high_max': 1.2992590200155973e-05, 'clip_ratio/region_mean': 3.174215930812352e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 185/1024 [8:13:19<39:51:20, 171.01s/it][AINFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-01 21:40:22,616 - math_verify.grader - WARNING - Timeout during comparison
+
+ 18%|█▊        | 186/1024 [8:16:26<40:56:32, 175.89s/it][A
+                                                        [A{'loss': 0.0678, 'grad_norm': 0.0033664393704384565, 'learning_rate': 1e-05, 'num_tokens': 155454988.0, 'completions/mean_length': 7285.78125, 'completions/min_length': 1176.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6992.2900390625, 'completions/min_terminated_length': 1176.0, 'completions/max_terminated_length': 15862.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022076331079006195, 'sampling/sampling_logp_difference/max': 7.873225212097168, 'sampling/importance_sampling_ratio/min': 0.0003808041801676154, 'sampling/importance_sampling_ratio/mean': 0.999931275844574, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.028538629412651, 'clip_ratio/low_mean': 3.7723172567893926e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.414224342028319e-06, 'clip_ratio/high_max': 2.686360085135675e-05, 'clip_ratio/region_mean': 4.5137397364669596e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 186/1024 [8:16:26<40:56:32, 175.89s/it][AINFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 187/1024 [8:19:43<42:23:03, 182.30s/it][A
+                                                        [A{'loss': 0.0995, 'grad_norm': 0.0029569920152425766, 'learning_rate': 1e-05, 'num_tokens': 156439609.0, 'completions/mean_length': 7546.1015625, 'completions/min_length': 794.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6956.90869140625, 'completions/min_terminated_length': 794.0, 'completions/max_terminated_length': 16380.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.305637001991272, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021088771522045135, 'sampling/sampling_logp_difference/max': 4.609542369842529, 'sampling/importance_sampling_ratio/min': 0.009956372901797295, 'sampling/importance_sampling_ratio/mean': 0.9999749660491943, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9216663613915443, 'clip_ratio/low_mean': 3.613749231590191e-05, 'clip_ratio/low_min': 6.27866324975912e-06, 'clip_ratio/high_mean': 2.9093872626617667e-06, 'clip_ratio/high_max': 1.1637549050647067e-05, 'clip_ratio/region_mean': 3.904687946487684e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 187/1024 [8:19:43<42:23:03, 182.30s/it][AINFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 188/1024 [8:22:51<42:40:40, 183.78s/it][A
+                                                        [A{'loss': 0.0039, 'grad_norm': 0.0023973705247044563, 'learning_rate': 1e-05, 'num_tokens': 157343374.0, 'completions/mean_length': 6866.6015625, 'completions/min_length': 866.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6791.66162109375, 'completions/min_terminated_length': 866.0, 'completions/max_terminated_length': 16271.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2511882185935974, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021616388112306595, 'sampling/sampling_logp_difference/max': 9.502913475036621, 'sampling/importance_sampling_ratio/min': 7.46340665500611e-05, 'sampling/importance_sampling_ratio/mean': 0.9999228715896606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9553637430071831, 'clip_ratio/low_mean': 1.9624552805908024e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6212559330597287e-06, 'clip_ratio/high_max': 6.485023732238915e-06, 'clip_ratio/region_mean': 2.1245808738967753e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 188/1024 [8:22:51<42:40:40, 183.78s/it][AINFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 189/1024 [8:25:47<42:07:39, 181.63s/it][A
+                                                        [A{'loss': 0.0056, 'grad_norm': 0.0023072708863765, 'learning_rate': 1e-05, 'num_tokens': 158173719.0, 'completions/mean_length': 6335.9453125, 'completions/min_length': 469.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5754.65283203125, 'completions/min_terminated_length': 469.0, 'completions/max_terminated_length': 14284.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018506702035665512, 'sampling/sampling_logp_difference/max': 8.732585906982422, 'sampling/importance_sampling_ratio/min': 0.0001612449559615925, 'sampling/importance_sampling_ratio/mean': 0.9998940229415894, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8574290797114372, 'clip_ratio/low_mean': 3.832016966498486e-05, 'clip_ratio/low_min': 5.240211066848133e-06, 'clip_ratio/high_mean': 2.2777185222366825e-06, 'clip_ratio/high_max': 9.11087408894673e-06, 'clip_ratio/region_mean': 4.059788818722154e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 189/1024 [8:25:47<42:07:39, 181.63s/it][AINFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▊        | 190/1024 [8:28:55<42:30:26, 183.49s/it][A
+                                                        [A{'loss': 0.041, 'grad_norm': 0.004400993697345257, 'learning_rate': 1e-05, 'num_tokens': 159248410.0, 'completions/mean_length': 8239.8984375, 'completions/min_length': 1080.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7768.751953125, 'completions/min_terminated_length': 1080.0, 'completions/max_terminated_length': 15951.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.32325342297554016, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02096184343099594, 'sampling/sampling_logp_difference/max': 13.686293601989746, 'sampling/importance_sampling_ratio/min': 1.1379369198039058e-06, 'sampling/importance_sampling_ratio/mean': 0.9998342990875244, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8983379155397415, 'clip_ratio/low_mean': 5.610333710137638e-05, 'clip_ratio/low_min': 1.3168393707019277e-05, 'clip_ratio/high_mean': 9.993626633786334e-06, 'clip_ratio/high_max': 3.0578403084291494e-05, 'clip_ratio/region_mean': 6.609696265513776e-05, 'epoch': 0.17}
+
+ 19%|█▊        | 190/1024 [8:28:55<42:30:26, 183.49s/it][AINFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▊        | 191/1024 [8:31:39<41:04:22, 177.51s/it][A
+                                                        [A{'loss': 0.0723, 'grad_norm': 0.00661451555788517, 'learning_rate': 1e-05, 'num_tokens': 160109904.0, 'completions/mean_length': 6580.921875, 'completions/min_length': 727.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 5659.26513671875, 'completions/min_terminated_length': 727.0, 'completions/max_terminated_length': 13741.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3874102830886841, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017984790727496147, 'sampling/sampling_logp_difference/max': 7.927308082580566, 'sampling/importance_sampling_ratio/min': 0.00036075623938813806, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8277688398957253, 'clip_ratio/low_mean': 6.66748674120754e-05, 'clip_ratio/low_min': 1.5295650428015506e-05, 'clip_ratio/high_mean': 2.2566434836335247e-06, 'clip_ratio/high_max': 9.026573934534099e-06, 'clip_ratio/region_mean': 6.89315111230826e-05, 'epoch': 0.18}
+
+ 19%|█▊        | 191/1024 [8:31:39<41:04:22, 177.51s/it][AINFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 192/1024 [8:34:29<40:30:01, 175.24s/it][A
+                                                        [A{'loss': 0.0368, 'grad_norm': 0.004417019430547953, 'learning_rate': 1e-05, 'num_tokens': 161103384.0, 'completions/mean_length': 7627.0, 'completions/min_length': 1916.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7416.83251953125, 'completions/min_terminated_length': 1916.0, 'completions/max_terminated_length': 16027.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3634909689426422, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01947963796555996, 'sampling/sampling_logp_difference/max': 9.937321662902832, 'sampling/importance_sampling_ratio/min': 4.833659477299079e-05, 'sampling/importance_sampling_ratio/mean': 0.9998986721038818, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8832443356513977, 'clip_ratio/low_mean': 4.045673085784074e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8589515207168006e-06, 'clip_ratio/high_max': 7.435806082867202e-06, 'clip_ratio/region_mean': 4.2315682549087796e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 192/1024 [8:34:29<40:30:01, 175.24s/it][AINFO 12-01 21:59:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:59:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:59:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:59:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 193/1024 [8:37:33<41:05:52, 178.04s/it][A
+                                                        [A{'loss': 0.0426, 'grad_norm': 0.0030983765609562397, 'learning_rate': 1e-05, 'num_tokens': 162199765.0, 'completions/mean_length': 8426.1015625, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7965.72705078125, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 16073.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.2540663480758667, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02070600539445877, 'sampling/sampling_logp_difference/max': 6.999904155731201, 'sampling/importance_sampling_ratio/min': 0.0009119694004766643, 'sampling/importance_sampling_ratio/mean': 0.9999411106109619, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8188603445887566, 'clip_ratio/low_mean': 2.6134909091979353e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.485296079157706e-06, 'clip_ratio/high_max': 9.941184316630824e-06, 'clip_ratio/region_mean': 2.8620205910101504e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 193/1024 [8:37:33<41:05:52, 178.04s/it][AINFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 194/1024 [8:40:32<41:06:50, 178.33s/it][A
+                                                        [A{'loss': 0.052, 'grad_norm': 0.003430198412388563, 'learning_rate': 1e-05, 'num_tokens': 163133232.0, 'completions/mean_length': 7154.2109375, 'completions/min_length': 1387.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6856.4755859375, 'completions/min_terminated_length': 1387.0, 'completions/max_terminated_length': 15904.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2120065689086914, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02190260961651802, 'sampling/sampling_logp_difference/max': 7.753361225128174, 'sampling/importance_sampling_ratio/min': 0.00042929715709760785, 'sampling/importance_sampling_ratio/mean': 1.0000275373458862, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9913735538721085, 'clip_ratio/low_mean': 3.7853451885894174e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.530347718580742e-06, 'clip_ratio/high_max': 2.612139087432297e-05, 'clip_ratio/region_mean': 4.438379949078808e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 194/1024 [8:40:32<41:06:50, 178.33s/it][AINFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 195/1024 [8:43:39<41:39:41, 180.92s/it][A
+                                                        [A{'loss': 0.0449, 'grad_norm': 0.002780586015433073, 'learning_rate': 1e-05, 'num_tokens': 164134393.0, 'completions/mean_length': 7693.1328125, 'completions/min_length': 1077.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7412.7822265625, 'completions/min_terminated_length': 1077.0, 'completions/max_terminated_length': 16252.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.20411095023155212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021110571920871735, 'sampling/sampling_logp_difference/max': 14.848588943481445, 'sampling/importance_sampling_ratio/min': 3.559096626304381e-07, 'sampling/importance_sampling_ratio/mean': 0.9999028444290161, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9887127950787544, 'clip_ratio/low_mean': 3.384581600585079e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.960363745951327e-07, 'clip_ratio/high_max': 3.1841454983805306e-06, 'clip_ratio/region_mean': 3.4641852380445926e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 195/1024 [8:43:39<41:39:41, 180.92s/it][AINFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 196/1024 [8:46:24<40:28:27, 175.98s/it][A
+                                                        [A{'loss': 0.0541, 'grad_norm': 0.0030156150460243225, 'learning_rate': 1e-05, 'num_tokens': 165063412.0, 'completions/mean_length': 7072.1484375, 'completions/min_length': 695.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6771.76611328125, 'completions/min_terminated_length': 695.0, 'completions/max_terminated_length': 16129.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019325289875268936, 'sampling/sampling_logp_difference/max': 12.999247550964355, 'sampling/importance_sampling_ratio/min': 2.2620308754994767e-06, 'sampling/importance_sampling_ratio/mean': 0.9998926520347595, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.861792616546154, 'clip_ratio/low_mean': 5.182203130971175e-05, 'clip_ratio/low_min': 1.5574546068819473e-05, 'clip_ratio/high_mean': 5.008155312680174e-06, 'clip_ratio/high_max': 9.770586984814145e-06, 'clip_ratio/region_mean': 5.683018616764457e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 196/1024 [8:46:24<40:28:27, 175.98s/it][AINFO 12-01 22:11:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:11:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:11:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:11:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 197/1024 [8:49:20<40:26:03, 176.01s/it][A
+                                                        [A{'loss': 0.0161, 'grad_norm': 0.0034921523183584213, 'learning_rate': 1e-05, 'num_tokens': 166024306.0, 'completions/mean_length': 7353.421875, 'completions/min_length': 916.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7062.11279296875, 'completions/min_terminated_length': 916.0, 'completions/max_terminated_length': 15062.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019593238830566406, 'sampling/sampling_logp_difference/max': 7.576326847076416, 'sampling/importance_sampling_ratio/min': 0.0005124400486238301, 'sampling/importance_sampling_ratio/mean': 0.9999784231185913, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8961873054504395, 'clip_ratio/low_mean': 6.156819108582567e-05, 'clip_ratio/low_min': 5.763157332694391e-06, 'clip_ratio/high_mean': 6.455301331698138e-06, 'clip_ratio/high_max': 2.2510209873871645e-05, 'clip_ratio/region_mean': 6.802349253121065e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 197/1024 [8:49:20<40:26:03, 176.01s/it][AINFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 198/1024 [8:52:00<39:18:16, 171.30s/it][A
+                                                        [A{'loss': 0.0635, 'grad_norm': 0.0027784397825598717, 'learning_rate': 1e-05, 'num_tokens': 166984982.0, 'completions/mean_length': 7348.03125, 'completions/min_length': 1619.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6903.63916015625, 'completions/min_terminated_length': 1619.0, 'completions/max_terminated_length': 15604.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3437528908252716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01857386901974678, 'sampling/sampling_logp_difference/max': 6.905689716339111, 'sampling/importance_sampling_ratio/min': 0.0010020677000284195, 'sampling/importance_sampling_ratio/mean': 1.0000090599060059, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.824029266834259, 'clip_ratio/low_mean': 5.347559840629401e-05, 'clip_ratio/low_min': 6.613406640099129e-06, 'clip_ratio/high_mean': 4.292725350296678e-06, 'clip_ratio/high_max': 1.3040991007073899e-05, 'clip_ratio/region_mean': 5.776832381343411e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 198/1024 [8:52:00<39:18:16, 171.30s/it][AINFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 199/1024 [8:54:49<39:05:55, 170.61s/it][A
+                                                        [A{'loss': 0.0165, 'grad_norm': 0.004110465291887522, 'learning_rate': 1e-05, 'num_tokens': 167936971.0, 'completions/mean_length': 7290.4765625, 'completions/min_length': 471.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6920.82080078125, 'completions/min_terminated_length': 471.0, 'completions/max_terminated_length': 16358.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.35901516675949097, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019696572795510292, 'sampling/sampling_logp_difference/max': 13.219663619995117, 'sampling/importance_sampling_ratio/min': 1.8145670992453233e-06, 'sampling/importance_sampling_ratio/mean': 0.9999493360519409, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8884479627013206, 'clip_ratio/low_mean': 3.2080681648949394e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0969530649163062e-05, 'clip_ratio/high_max': 3.330808067403268e-05, 'clip_ratio/region_mean': 4.3050211388617754e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 199/1024 [8:54:49<39:05:55, 170.61s/it][AINFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 200/1024 [8:57:56<40:09:34, 175.45s/it][A
+                                                        [A{'loss': 0.1147, 'grad_norm': 0.002410614863038063, 'learning_rate': 1e-05, 'num_tokens': 168955683.0, 'completions/mean_length': 7803.625, 'completions/min_length': 929.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 6833.66943359375, 'completions/min_terminated_length': 929.0, 'completions/max_terminated_length': 15824.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018545793369412422, 'sampling/sampling_logp_difference/max': 7.035423755645752, 'sampling/importance_sampling_ratio/min': 0.0008801451185718179, 'sampling/importance_sampling_ratio/mean': 0.999977707862854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8326860442757607, 'clip_ratio/low_mean': 3.466498992565903e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4433944076918124e-06, 'clip_ratio/high_max': 9.77357763076725e-06, 'clip_ratio/region_mean': 3.710838473125477e-05, 'epoch': 0.18}
+
+ 20%|█▉        | 200/1024 [8:57:56<40:09:34, 175.45s/it][AINFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 201/1024 [9:00:32<38:48:24, 169.75s/it][A
+                                                        [A{'loss': 0.0499, 'grad_norm': 0.0034376555122435093, 'learning_rate': 1e-05, 'num_tokens': 169845823.0, 'completions/mean_length': 6804.34375, 'completions/min_length': 645.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6495.322265625, 'completions/min_terminated_length': 645.0, 'completions/max_terminated_length': 16272.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.31534504890441895, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020515555515885353, 'sampling/sampling_logp_difference/max': 17.850955963134766, 'sampling/importance_sampling_ratio/min': 1.767780588579626e-08, 'sampling/importance_sampling_ratio/mean': 1.0000131130218506, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9669496119022369, 'clip_ratio/low_mean': 3.4781527119776e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6505314824353263e-06, 'clip_ratio/high_max': 1.4602125929741305e-05, 'clip_ratio/region_mean': 3.8432058772741584e-05, 'epoch': 0.18}
+
+ 20%|█▉        | 201/1024 [9:00:32<38:48:24, 169.75s/it][AINFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 202/1024 [9:03:28<39:11:15, 171.63s/it][A
+                                                        [A{'loss': 0.1046, 'grad_norm': 0.0026675171684473753, 'learning_rate': 1e-05, 'num_tokens': 170738210.0, 'completions/mean_length': 6827.9609375, 'completions/min_length': 156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6105.23583984375, 'completions/min_terminated_length': 156.0, 'completions/max_terminated_length': 16350.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2698654532432556, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019948139786720276, 'sampling/sampling_logp_difference/max': 5.840882778167725, 'sampling/importance_sampling_ratio/min': 0.002906275913119316, 'sampling/importance_sampling_ratio/mean': 1.000019907951355, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8833946585655212, 'clip_ratio/low_mean': 3.574208744794305e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.953680618451472e-06, 'clip_ratio/high_max': 1.5814722473805887e-05, 'clip_ratio/region_mean': 3.9695768407455034e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 202/1024 [9:03:28<39:11:15, 171.63s/it][AINFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 203/1024 [9:06:14<38:42:59, 169.77s/it][A
+                                                        [A{'loss': 0.034, 'grad_norm': 0.0039620306342840195, 'learning_rate': 1e-05, 'num_tokens': 171705152.0, 'completions/mean_length': 7377.984375, 'completions/min_length': 556.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7307.07080078125, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 15725.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01964445412158966, 'sampling/sampling_logp_difference/max': 10.614632606506348, 'sampling/importance_sampling_ratio/min': 2.4554079573135823e-05, 'sampling/importance_sampling_ratio/mean': 0.999995231628418, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8881714344024658, 'clip_ratio/low_mean': 6.462372630267055e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.1557804593139736e-06, 'clip_ratio/high_max': 1.6623121837255894e-05, 'clip_ratio/region_mean': 6.877950727357529e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 203/1024 [9:06:14<38:42:59, 169.77s/it][AINFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 204/1024 [9:09:12<39:14:05, 172.25s/it][A
+                                                        [A{'loss': 0.0268, 'grad_norm': 0.0040458571165800095, 'learning_rate': 1e-05, 'num_tokens': 172501881.0, 'completions/mean_length': 6051.8828125, 'completions/min_length': 819.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5543.74560546875, 'completions/min_terminated_length': 819.0, 'completions/max_terminated_length': 15265.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01957303285598755, 'sampling/sampling_logp_difference/max': 6.120361804962158, 'sampling/importance_sampling_ratio/min': 0.0021976607386022806, 'sampling/importance_sampling_ratio/mean': 0.9999410510063171, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8851477280259132, 'clip_ratio/low_mean': 2.775239624952519e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.409777835055138e-06, 'clip_ratio/high_max': 9.639111340220552e-06, 'clip_ratio/region_mean': 3.0162174198267167e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 204/1024 [9:09:12<39:14:05, 172.25s/it][AINFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 205/1024 [9:12:01<38:58:01, 171.28s/it][A
+                                                        [A{'loss': 0.0427, 'grad_norm': 0.005941574461758137, 'learning_rate': 1e-05, 'num_tokens': 173522391.0, 'completions/mean_length': 7830.171875, 'completions/min_length': 954.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7409.4912109375, 'completions/min_terminated_length': 954.0, 'completions/max_terminated_length': 16034.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.33668074011802673, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021295130252838135, 'sampling/sampling_logp_difference/max': 9.052275657653809, 'sampling/importance_sampling_ratio/min': 0.00011712420382536948, 'sampling/importance_sampling_ratio/mean': 1.0000017881393433, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9070459827780724, 'clip_ratio/low_mean': 5.158422732165491e-05, 'clip_ratio/low_min': 1.1939961495954776e-05, 'clip_ratio/high_mean': 3.529455852913088e-06, 'clip_ratio/high_max': 9.72708312474424e-06, 'clip_ratio/region_mean': 5.5113683174567996e-05, 'epoch': 0.19}
+
+ 20%|██        | 205/1024 [9:12:01<38:58:01, 171.28s/it][AINFO 12-01 22:37:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:37:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:37:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:37:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 206/1024 [9:15:08<40:01:06, 176.12s/it][A
+                                                        [A{'loss': 0.0273, 'grad_norm': 0.0025851845275610685, 'learning_rate': 1e-05, 'num_tokens': 174504534.0, 'completions/mean_length': 7520.6796875, 'completions/min_length': 1321.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6769.55078125, 'completions/min_terminated_length': 1321.0, 'completions/max_terminated_length': 15443.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2188364714384079, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02016005665063858, 'sampling/sampling_logp_difference/max': 7.835196018218994, 'sampling/importance_sampling_ratio/min': 0.00039556476986035705, 'sampling/importance_sampling_ratio/mean': 0.999911367893219, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8843575045466423, 'clip_ratio/low_mean': 1.718775109793569e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3885803582525114e-06, 'clip_ratio/high_max': 5.5543214330100454e-06, 'clip_ratio/region_mean': 1.8576331683561875e-05, 'epoch': 0.19}
+
+ 20%|██        | 206/1024 [9:15:08<40:01:06, 176.12s/it][AINFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 207/1024 [9:18:08<40:15:08, 177.37s/it][A
+                                                        [A{'loss': 0.047, 'grad_norm': 0.004170550964772701, 'learning_rate': 1e-05, 'num_tokens': 175472574.0, 'completions/mean_length': 7382.1875, 'completions/min_length': 934.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6861.42138671875, 'completions/min_terminated_length': 934.0, 'completions/max_terminated_length': 16173.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020749717950820923, 'sampling/sampling_logp_difference/max': 10.481352806091309, 'sampling/importance_sampling_ratio/min': 2.8054744689143263e-05, 'sampling/importance_sampling_ratio/mean': 0.9999932646751404, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.916313610970974, 'clip_ratio/low_mean': 3.617897255026037e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.536370288908074e-06, 'clip_ratio/high_max': 1.0145481155632297e-05, 'clip_ratio/region_mean': 3.871534295285528e-05, 'epoch': 0.19}
+
+ 20%|██        | 207/1024 [9:18:08<40:15:08, 177.37s/it][AINFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 208/1024 [9:20:26<37:31:58, 165.59s/it][A
+                                                        [A{'loss': 0.0447, 'grad_norm': 0.004663965664803982, 'learning_rate': 1e-05, 'num_tokens': 176275568.0, 'completions/mean_length': 6122.453125, 'completions/min_length': 1192.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6041.6533203125, 'completions/min_terminated_length': 1192.0, 'completions/max_terminated_length': 13891.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3284856975078583, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020278753712773323, 'sampling/sampling_logp_difference/max': 11.74999713897705, 'sampling/importance_sampling_ratio/min': 7.88934721640544e-06, 'sampling/importance_sampling_ratio/mean': 0.9999363422393799, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8984386026859283, 'clip_ratio/low_mean': 3.83663013963087e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.83663013963087e-05, 'epoch': 0.19}
+
+ 20%|██        | 208/1024 [9:20:26<37:31:58, 165.59s/it][AINFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 209/1024 [9:22:53<36:10:49, 159.82s/it][A
+                                                        [A{'loss': 0.1066, 'grad_norm': 0.004848882555961609, 'learning_rate': 1e-05, 'num_tokens': 176932549.0, 'completions/mean_length': 4983.2890625, 'completions/min_length': 589.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4709.67236328125, 'completions/min_terminated_length': 589.0, 'completions/max_terminated_length': 15547.0, 'rewards/accuracy_reward/mean': 0.6484375, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.6484375, 'reward_std': 0.2772369980812073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017959970980882645, 'sampling/sampling_logp_difference/max': 11.026308059692383, 'sampling/importance_sampling_ratio/min': 1.626804078114219e-05, 'sampling/importance_sampling_ratio/mean': 0.9999616146087646, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.825260303914547, 'clip_ratio/low_mean': 4.3961883989140915e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6337880828796187e-06, 'clip_ratio/high_max': 1.4535152331518475e-05, 'clip_ratio/region_mean': 4.7595671958333696e-05, 'epoch': 0.19}
+
+ 20%|██        | 209/1024 [9:22:53<36:10:49, 159.82s/it][AINFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 210/1024 [9:25:17<35:04:16, 155.11s/it][A
+                                                        [A{'loss': 0.0977, 'grad_norm': 0.004749474115669727, 'learning_rate': 1e-05, 'num_tokens': 177691752.0, 'completions/mean_length': 5766.5234375, 'completions/min_length': 700.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5511.7041015625, 'completions/min_terminated_length': 700.0, 'completions/max_terminated_length': 15415.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2738044261932373, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019118282943964005, 'sampling/sampling_logp_difference/max': 11.626367568969727, 'sampling/importance_sampling_ratio/min': 8.927558155846782e-06, 'sampling/importance_sampling_ratio/mean': 1.0000141859054565, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9016259610652924, 'clip_ratio/low_mean': 4.2418692146384274e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7854651989400736e-06, 'clip_ratio/high_max': 1.1141860795760294e-05, 'clip_ratio/region_mean': 4.5204157913758536e-05, 'epoch': 0.19}
+
+ 21%|██        | 210/1024 [9:25:17<35:04:16, 155.11s/it][AINFO 12-01 22:50:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:50:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:50:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:50:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 211/1024 [9:27:55<35:12:43, 155.92s/it][A
+                                                        [A{'loss': 0.1135, 'grad_norm': 0.004418120253831148, 'learning_rate': 1e-05, 'num_tokens': 178603454.0, 'completions/mean_length': 6993.671875, 'completions/min_length': 889.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6768.30419921875, 'completions/min_terminated_length': 889.0, 'completions/max_terminated_length': 15696.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01957814022898674, 'sampling/sampling_logp_difference/max': 6.312445640563965, 'sampling/importance_sampling_ratio/min': 0.0018135923892259598, 'sampling/importance_sampling_ratio/mean': 1.000037670135498, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9074988812208176, 'clip_ratio/low_mean': 4.609663824339805e-05, 'clip_ratio/low_min': 3.983555870945565e-06, 'clip_ratio/high_mean': 2.1587275114143267e-06, 'clip_ratio/high_max': 5.5243735914700665e-06, 'clip_ratio/region_mean': 4.8255366664307076e-05, 'epoch': 0.19}
+
+ 21%|██        | 211/1024 [9:27:55<35:12:43, 155.92s/it][AINFO 12-01 22:52:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:52:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:52:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:52:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 212/1024 [9:30:58<37:00:15, 164.06s/it][A
+                                                        [A{'loss': 0.0172, 'grad_norm': 0.00237120408564806, 'learning_rate': 1e-05, 'num_tokens': 179577063.0, 'completions/mean_length': 7445.1328125, 'completions/min_length': 24.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6849.20849609375, 'completions/min_terminated_length': 24.0, 'completions/max_terminated_length': 15316.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.21040897071361542, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02165937051177025, 'sampling/sampling_logp_difference/max': 9.245802879333496, 'sampling/importance_sampling_ratio/min': 9.651589061832055e-05, 'sampling/importance_sampling_ratio/mean': 0.9999725818634033, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9255013465881348, 'clip_ratio/low_mean': 2.7488794444252562e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2817357628591708e-06, 'clip_ratio/high_max': 5.126943051436683e-06, 'clip_ratio/region_mean': 2.877053032079857e-05, 'epoch': 0.2}
+
+ 21%|██        | 212/1024 [9:30:58<37:00:15, 164.06s/it][AINFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 213/1024 [9:33:50<37:32:46, 166.67s/it][A
+                                                        [A{'loss': 0.1291, 'grad_norm': 0.004715202376246452, 'learning_rate': 1e-05, 'num_tokens': 180380422.0, 'completions/mean_length': 6120.5546875, 'completions/min_length': 471.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5703.34130859375, 'completions/min_terminated_length': 471.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.29355230927467346, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018377620726823807, 'sampling/sampling_logp_difference/max': 5.437493324279785, 'sampling/importance_sampling_ratio/min': 0.004350374918431044, 'sampling/importance_sampling_ratio/mean': 0.999874472618103, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8181199952960014, 'clip_ratio/low_mean': 2.6486316301088664e-05, 'clip_ratio/low_min': 3.516273409331916e-06, 'clip_ratio/high_mean': 4.7390736881425255e-06, 'clip_ratio/high_max': 1.8956294752570102e-05, 'clip_ratio/region_mean': 3.122539010291803e-05, 'epoch': 0.2}
+
+ 21%|██        | 213/1024 [9:33:50<37:32:46, 166.67s/it][AINFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 214/1024 [9:36:30<37:01:03, 164.52s/it][A
+                                                        [A{'loss': 0.0311, 'grad_norm': 0.003063712501898408, 'learning_rate': 1e-05, 'num_tokens': 181212776.0, 'completions/mean_length': 6351.203125, 'completions/min_length': 694.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5857.78662109375, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 16005.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3048579692840576, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019523698836565018, 'sampling/sampling_logp_difference/max': 11.74971866607666, 'sampling/importance_sampling_ratio/min': 7.891544555604924e-06, 'sampling/importance_sampling_ratio/mean': 0.9999946355819702, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8798654451966286, 'clip_ratio/low_mean': 3.4097628713425365e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.376495558564784e-06, 'clip_ratio/high_max': 5.594843969447538e-06, 'clip_ratio/region_mean': 3.6474124044616474e-05, 'epoch': 0.2}
+
+ 21%|██        | 214/1024 [9:36:30<37:01:03, 164.52s/it][AINFO 12-01 23:01:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:01:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:01:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:01:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 215/1024 [9:39:02<36:08:01, 160.79s/it][A
+                                                        [A{'loss': 0.0924, 'grad_norm': 0.0033194730058312416, 'learning_rate': 1e-05, 'num_tokens': 182041910.0, 'completions/mean_length': 6330.046875, 'completions/min_length': 701.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6170.46044921875, 'completions/min_terminated_length': 701.0, 'completions/max_terminated_length': 14180.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018981872126460075, 'sampling/sampling_logp_difference/max': 9.158197402954102, 'sampling/importance_sampling_ratio/min': 0.00010535263572819531, 'sampling/importance_sampling_ratio/mean': 0.9998994469642639, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8319354206323624, 'clip_ratio/low_mean': 3.544438988001275e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.544438988001275e-05, 'epoch': 0.2}
+
+ 21%|██        | 215/1024 [9:39:02<36:08:01, 160.79s/it][AINFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 216/1024 [9:42:15<38:16:55, 170.56s/it][A
+                                                        [A{'loss': 0.0288, 'grad_norm': 0.004492956213653088, 'learning_rate': 1e-05, 'num_tokens': 182914843.0, 'completions/mean_length': 6665.2890625, 'completions/min_length': 722.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6351.7822265625, 'completions/min_terminated_length': 722.0, 'completions/max_terminated_length': 15982.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.14807432889938354, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02088768407702446, 'sampling/sampling_logp_difference/max': 4.474179744720459, 'sampling/importance_sampling_ratio/min': 0.011399568989872932, 'sampling/importance_sampling_ratio/mean': 1.000030279159546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9336326420307159, 'clip_ratio/low_mean': 1.7156292415165808e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.7156292415165808e-05, 'epoch': 0.2}
+
+ 21%|██        | 216/1024 [9:42:15<38:16:55, 170.56s/it][AINFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 217/1024 [9:44:54<37:24:16, 166.86s/it][A
+                                                        [A{'loss': -0.004, 'grad_norm': 0.003816079581156373, 'learning_rate': 1e-05, 'num_tokens': 183628152.0, 'completions/mean_length': 5393.9140625, 'completions/min_length': 628.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5039.39501953125, 'completions/min_terminated_length': 628.0, 'completions/max_terminated_length': 16064.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.31694266200065613, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018448319286108017, 'sampling/sampling_logp_difference/max': 5.730112552642822, 'sampling/importance_sampling_ratio/min': 0.003246711567044258, 'sampling/importance_sampling_ratio/mean': 0.9998779892921448, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7864786610007286, 'clip_ratio/low_mean': 5.4809036328151706e-05, 'clip_ratio/low_min': 8.953898031904828e-06, 'clip_ratio/high_mean': 9.084843100026774e-06, 'clip_ratio/high_max': 3.2495465802639956e-05, 'clip_ratio/region_mean': 6.389387954186532e-05, 'epoch': 0.2}
+
+ 21%|██        | 217/1024 [9:44:54<37:24:16, 166.86s/it][AINFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 218/1024 [9:47:38<37:10:50, 166.07s/it][A
+                                                        [A{'loss': 0.0617, 'grad_norm': 0.003666195785626769, 'learning_rate': 1e-05, 'num_tokens': 184562352.0, 'completions/mean_length': 7161.5, 'completions/min_length': 681.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7015.111328125, 'completions/min_terminated_length': 681.0, 'completions/max_terminated_length': 15453.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019755780696868896, 'sampling/sampling_logp_difference/max': 8.272256851196289, 'sampling/importance_sampling_ratio/min': 0.00025550799909979105, 'sampling/importance_sampling_ratio/mean': 0.9999294281005859, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.915394201874733, 'clip_ratio/low_mean': 1.6896704778446292e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1596620172203984e-06, 'clip_ratio/high_max': 8.638648068881594e-06, 'clip_ratio/region_mean': 1.9056366909353528e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 218/1024 [9:47:38<37:10:50, 166.07s/it][AINFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 219/1024 [9:50:36<37:57:19, 169.74s/it][A
+                                                        [A{'loss': 0.032, 'grad_norm': 0.0025940234772861004, 'learning_rate': 1e-05, 'num_tokens': 185606670.0, 'completions/mean_length': 7957.671875, 'completions/min_length': 96.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7685.8544921875, 'completions/min_terminated_length': 96.0, 'completions/max_terminated_length': 15408.0, 'rewards/accuracy_reward/mean': 0.1171875, 'rewards/accuracy_reward/std': 0.322907418012619, 'reward': 0.1171875, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02338646724820137, 'sampling/sampling_logp_difference/max': 7.179195404052734, 'sampling/importance_sampling_ratio/min': 0.0007622809498570859, 'sampling/importance_sampling_ratio/mean': 0.999893844127655, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1176252663135529, 'clip_ratio/low_mean': 2.49038239417132e-05, 'clip_ratio/low_min': 4.00025601265952e-06, 'clip_ratio/high_mean': 1.6062328995758435e-06, 'clip_ratio/high_max': 6.424931598303374e-06, 'clip_ratio/region_mean': 2.651005689813246e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 219/1024 [9:50:36<37:57:19, 169.74s/it][AINFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-01 23:17:24,381 - math_verify.grader - WARNING - Timeout during comparison
+
+ 21%|██▏       | 220/1024 [9:53:24<37:47:36, 169.22s/it][A
+                                                        [A{'loss': 0.0607, 'grad_norm': 0.004315398633480072, 'learning_rate': 1e-05, 'num_tokens': 186526883.0, 'completions/mean_length': 7060.6640625, 'completions/min_length': 1460.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6759.9111328125, 'completions/min_terminated_length': 1460.0, 'completions/max_terminated_length': 16146.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01967843994498253, 'sampling/sampling_logp_difference/max': 7.687473297119141, 'sampling/importance_sampling_ratio/min': 0.0004585353017318994, 'sampling/importance_sampling_ratio/mean': 1.000004529953003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9148540124297142, 'clip_ratio/low_mean': 4.4742550926457625e-05, 'clip_ratio/low_min': 3.5803282116830815e-06, 'clip_ratio/high_mean': 5.829163114867697e-06, 'clip_ratio/high_max': 1.9903963220713194e-05, 'clip_ratio/region_mean': 5.057171370026481e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 220/1024 [9:53:24<37:47:36, 169.22s/it][AINFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 221/1024 [9:56:03<37:00:54, 165.95s/it][A
+                                                        [A{'loss': 0.0606, 'grad_norm': 0.0030786178540438414, 'learning_rate': 1e-05, 'num_tokens': 187397536.0, 'completions/mean_length': 6649.6640625, 'completions/min_length': 780.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6416.04052734375, 'completions/min_terminated_length': 780.0, 'completions/max_terminated_length': 15596.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020215414464473724, 'sampling/sampling_logp_difference/max': 14.929608345031738, 'sampling/importance_sampling_ratio/min': 3.2821125728332845e-07, 'sampling/importance_sampling_ratio/mean': 1.000005841255188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9298559054732323, 'clip_ratio/low_mean': 2.8967988555450574e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8677483214778476e-06, 'clip_ratio/high_max': 1.147099328591139e-05, 'clip_ratio/region_mean': 3.1835736763241584e-05, 'epoch': 0.2}
+
+ 22%|██▏       | 221/1024 [9:56:03<37:00:54, 165.95s/it][AINFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 222/1024 [9:59:07<38:13:40, 171.60s/it][A
+                                                        [A{'loss': 0.044, 'grad_norm': 0.002438523108139634, 'learning_rate': 1e-05, 'num_tokens': 188477778.0, 'completions/mean_length': 8292.015625, 'completions/min_length': 533.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7823.8837890625, 'completions/min_terminated_length': 533.0, 'completions/max_terminated_length': 16210.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018984414637088776, 'sampling/sampling_logp_difference/max': 5.178531169891357, 'sampling/importance_sampling_ratio/min': 0.005636279005557299, 'sampling/importance_sampling_ratio/mean': 1.0000240802764893, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8232023045420647, 'clip_ratio/low_mean': 3.249637484259438e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.692142735824746e-06, 'clip_ratio/high_max': 2.2768570943298982e-05, 'clip_ratio/region_mean': 3.8188517464732286e-05, 'epoch': 0.2}
+
+ 22%|██▏       | 222/1024 [9:59:07<38:13:40, 171.60s/it][AINFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 223/1024 [10:02:04<38:29:49, 173.02s/it][A
+                                                         [A{'loss': 0.0486, 'grad_norm': 0.004773247055709362, 'learning_rate': 1e-05, 'num_tokens': 189470655.0, 'completions/mean_length': 7600.9765625, 'completions/min_length': 995.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6936.71484375, 'completions/min_terminated_length': 995.0, 'completions/max_terminated_length': 15991.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3079911172389984, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018666012212634087, 'sampling/sampling_logp_difference/max': 6.624707221984863, 'sampling/importance_sampling_ratio/min': 0.001327168894931674, 'sampling/importance_sampling_ratio/mean': 0.9999308586120605, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8689917623996735, 'clip_ratio/low_mean': 2.255633432923787e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.135253556749376e-06, 'clip_ratio/high_max': 2.0840709566982696e-05, 'clip_ratio/region_mean': 2.869158777230041e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 223/1024 [10:02:04<38:29:49, 173.02s/it][AINFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 224/1024 [10:04:52<38:09:18, 171.70s/it][A
+                                                         [A{'loss': 0.145, 'grad_norm': 0.004298723768442869, 'learning_rate': 1e-05, 'num_tokens': 190462227.0, 'completions/mean_length': 7600.34375, 'completions/min_length': 1335.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6855.96630859375, 'completions/min_terminated_length': 1335.0, 'completions/max_terminated_length': 16215.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2919674217700958, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018035393208265305, 'sampling/sampling_logp_difference/max': 9.996363639831543, 'sampling/importance_sampling_ratio/min': 4.5565320760942996e-05, 'sampling/importance_sampling_ratio/mean': 0.9999310374259949, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7636929750442505, 'clip_ratio/low_mean': 6.463955219260242e-05, 'clip_ratio/low_min': 1.0895145351241808e-05, 'clip_ratio/high_mean': 2.459364736751013e-06, 'clip_ratio/high_max': 9.837458947004052e-06, 'clip_ratio/region_mean': 6.70989177251613e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 224/1024 [10:04:52<38:09:18, 171.70s/it][AINFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 225/1024 [10:07:43<38:01:54, 171.36s/it][A
+                                                         [A{'loss': 0.0859, 'grad_norm': 0.006741553544998169, 'learning_rate': 1e-05, 'num_tokens': 191312483.0, 'completions/mean_length': 6512.0, 'completions/min_length': 574.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6434.267578125, 'completions/min_terminated_length': 574.0, 'completions/max_terminated_length': 15151.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020878732204437256, 'sampling/sampling_logp_difference/max': 10.937172889709473, 'sampling/importance_sampling_ratio/min': 1.778468504198827e-05, 'sampling/importance_sampling_ratio/mean': 1.0000028610229492, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9043584689497948, 'clip_ratio/low_mean': 2.6516039497437305e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5151505812791584e-06, 'clip_ratio/high_max': 1.4060602325116633e-05, 'clip_ratio/region_mean': 3.003119024924672e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 225/1024 [10:07:43<38:01:54, 171.36s/it][AINFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 226/1024 [10:10:32<37:49:32, 170.64s/it][A
+                                                         [A{'loss': 0.0515, 'grad_norm': 0.00281486171297729, 'learning_rate': 1e-05, 'num_tokens': 192251235.0, 'completions/mean_length': 7178.6875, 'completions/min_length': 847.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6565.00048828125, 'completions/min_terminated_length': 847.0, 'completions/max_terminated_length': 16339.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2240736484527588, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020196784287691116, 'sampling/sampling_logp_difference/max': 9.314308166503906, 'sampling/importance_sampling_ratio/min': 9.012543159769848e-05, 'sampling/importance_sampling_ratio/mean': 0.9999714493751526, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8899475410580635, 'clip_ratio/low_mean': 2.8831826739406097e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.339021302257606e-06, 'clip_ratio/high_max': 1.7356085209030425e-05, 'clip_ratio/region_mean': 3.317084781429003e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 226/1024 [10:10:32<37:49:32, 170.64s/it][AINFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 227/1024 [10:13:05<36:36:25, 165.35s/it][A
+                                                         [A{'loss': 0.0781, 'grad_norm': 0.005070593673735857, 'learning_rate': 1e-05, 'num_tokens': 193116763.0, 'completions/mean_length': 6602.5625, 'completions/min_length': 927.0, 'completions/max_length': 15501.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6602.5625, 'completions/min_terminated_length': 927.0, 'completions/max_terminated_length': 15501.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020026464015245438, 'sampling/sampling_logp_difference/max': 12.812478065490723, 'sampling/importance_sampling_ratio/min': 2.726537559283315e-06, 'sampling/importance_sampling_ratio/mean': 0.9999746680259705, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9266818463802338, 'clip_ratio/low_mean': 3.0248688972278615e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.889521053679346e-06, 'clip_ratio/high_max': 1.5558084214717383e-05, 'clip_ratio/region_mean': 3.413820991227112e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 227/1024 [10:13:05<36:36:25, 165.35s/it][AINFO 12-01 23:38:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:38:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:38:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:38:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 228/1024 [10:15:43<36:06:16, 163.29s/it][A
+                                                         [A{'loss': 0.0637, 'grad_norm': 0.006362155079841614, 'learning_rate': 1e-05, 'num_tokens': 194007868.0, 'completions/mean_length': 6818.8828125, 'completions/min_length': 510.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6430.056640625, 'completions/min_terminated_length': 510.0, 'completions/max_terminated_length': 16046.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01943325623869896, 'sampling/sampling_logp_difference/max': 7.55847692489624, 'sampling/importance_sampling_ratio/min': 0.0005216691642999649, 'sampling/importance_sampling_ratio/mean': 1.000009298324585, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.874519519507885, 'clip_ratio/low_mean': 2.959152834591805e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.047181854119117e-06, 'clip_ratio/high_max': 4.188727416476468e-06, 'clip_ratio/region_mean': 3.063871008635033e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 228/1024 [10:15:43<36:06:16, 163.29s/it][AINFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 229/1024 [10:18:21<35:41:55, 161.65s/it][A
+                                                         [A{'loss': 0.1061, 'grad_norm': 0.003797185141593218, 'learning_rate': 1e-05, 'num_tokens': 194735980.0, 'completions/mean_length': 5515.625, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5343.111328125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 14536.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.34010058641433716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02120930328965187, 'sampling/sampling_logp_difference/max': 15.989612579345703, 'sampling/importance_sampling_ratio/min': 1.137102216830499e-07, 'sampling/importance_sampling_ratio/mean': 0.999911367893219, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0683523043990135, 'clip_ratio/low_mean': 6.821557258263056e-05, 'clip_ratio/low_min': 1.7265090718865395e-05, 'clip_ratio/high_mean': 2.4114777943395893e-06, 'clip_ratio/high_max': 9.645911177358357e-06, 'clip_ratio/region_mean': 7.062705049065698e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 229/1024 [10:18:21<35:41:55, 161.65s/it][AINFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 230/1024 [10:20:44<34:23:17, 155.92s/it][A
+                                                         [A{'loss': 0.0204, 'grad_norm': 0.004124365746974945, 'learning_rate': 1e-05, 'num_tokens': 195504882.0, 'completions/mean_length': 5853.546875, 'completions/min_length': 615.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5770.6298828125, 'completions/min_terminated_length': 615.0, 'completions/max_terminated_length': 14992.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3243142366409302, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017819223925471306, 'sampling/sampling_logp_difference/max': 5.717539310455322, 'sampling/importance_sampling_ratio/min': 0.0032877910416573286, 'sampling/importance_sampling_ratio/mean': 1.0000672340393066, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7975900694727898, 'clip_ratio/low_mean': 4.9151800567415194e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.4928530630604655e-06, 'clip_ratio/high_max': 2.1971412252241862e-05, 'clip_ratio/region_mean': 5.4644653801005916e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 230/1024 [10:20:44<34:23:17, 155.92s/it][AINFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 231/1024 [10:23:34<35:17:47, 160.24s/it][A
+                                                         [A{'loss': 0.0947, 'grad_norm': 0.0024995009880512953, 'learning_rate': 1e-05, 'num_tokens': 196379306.0, 'completions/mean_length': 6686.25, 'completions/min_length': 260.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6532.31787109375, 'completions/min_terminated_length': 260.0, 'completions/max_terminated_length': 15503.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.35824593901634216, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018989525735378265, 'sampling/sampling_logp_difference/max': 10.818918228149414, 'sampling/importance_sampling_ratio/min': 2.0017207134515047e-05, 'sampling/importance_sampling_ratio/mean': 0.9999300837516785, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9018580466508865, 'clip_ratio/low_mean': 5.1467116236381116e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.843255515472265e-06, 'clip_ratio/high_max': 7.066538728395244e-06, 'clip_ratio/region_mean': 5.431037175185338e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 231/1024 [10:23:34<35:17:47, 160.24s/it][AINFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 232/1024 [10:26:50<37:37:43, 171.04s/it][A
+                                                         [A{'loss': 0.0754, 'grad_norm': 0.004295211285352707, 'learning_rate': 1e-05, 'num_tokens': 197357397.0, 'completions/mean_length': 7487.3359375, 'completions/min_length': 1222.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7200.3466796875, 'completions/min_terminated_length': 1222.0, 'completions/max_terminated_length': 16347.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02209121733903885, 'sampling/sampling_logp_difference/max': 7.33111047744751, 'sampling/importance_sampling_ratio/min': 0.0006548459641635418, 'sampling/importance_sampling_ratio/mean': 1.000002384185791, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9890001565217972, 'clip_ratio/low_mean': 3.699686294567073e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5722979444253724e-06, 'clip_ratio/high_max': 6.652828687947476e-06, 'clip_ratio/region_mean': 3.95691608900961e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 232/1024 [10:26:50<37:37:43, 171.04s/it][AINFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 233/1024 [10:29:40<37:29:19, 170.62s/it][A
+                                                         [A{'loss': 0.0914, 'grad_norm': 0.003119673579931259, 'learning_rate': 1e-05, 'num_tokens': 198303795.0, 'completions/mean_length': 7233.484375, 'completions/min_length': 706.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6938.30615234375, 'completions/min_terminated_length': 706.0, 'completions/max_terminated_length': 15825.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.23014704883098602, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021085180342197418, 'sampling/sampling_logp_difference/max': 3.89424467086792, 'sampling/importance_sampling_ratio/min': 0.020358745008707047, 'sampling/importance_sampling_ratio/mean': 1.0000243186950684, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9683803990483284, 'clip_ratio/low_mean': 2.9443070673096372e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5212734751912649e-06, 'clip_ratio/high_max': 6.0850939007650595e-06, 'clip_ratio/region_mean': 3.0964344205131056e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 233/1024 [10:29:40<37:29:19, 170.62s/it][AINFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 234/1024 [10:32:10<36:06:22, 164.54s/it][A
+                                                         [A{'loss': 0.1028, 'grad_norm': 0.0033790848683565855, 'learning_rate': 1e-05, 'num_tokens': 199154735.0, 'completions/mean_length': 6457.78125, 'completions/min_length': 850.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6300.22265625, 'completions/min_terminated_length': 850.0, 'completions/max_terminated_length': 15733.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01950821653008461, 'sampling/sampling_logp_difference/max': 15.063070297241211, 'sampling/importance_sampling_ratio/min': 2.872048128210736e-07, 'sampling/importance_sampling_ratio/mean': 0.9998799562454224, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8881053999066353, 'clip_ratio/low_mean': 4.031422963635123e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9909530237782747e-06, 'clip_ratio/high_max': 7.963812095113099e-06, 'clip_ratio/region_mean': 4.23051826601295e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 234/1024 [10:32:10<36:06:22, 164.54s/it][AINFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 235/1024 [10:35:04<36:38:48, 167.21s/it][A
+                                                         [A{'loss': 0.0179, 'grad_norm': 0.0021492803934961557, 'learning_rate': 1e-05, 'num_tokens': 200185643.0, 'completions/mean_length': 7904.40625, 'completions/min_length': 1128.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7769.81005859375, 'completions/min_terminated_length': 1128.0, 'completions/max_terminated_length': 16318.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.1820138692855835, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021201875060796738, 'sampling/sampling_logp_difference/max': 6.530262470245361, 'sampling/importance_sampling_ratio/min': 0.001458622980862856, 'sampling/importance_sampling_ratio/mean': 1.0001094341278076, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9881557524204254, 'clip_ratio/low_mean': 2.2856192117615137e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3430123380639998e-06, 'clip_ratio/high_max': 9.059622016138746e-06, 'clip_ratio/region_mean': 2.6199204512522556e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 235/1024 [10:35:04<36:38:48, 167.21s/it][AINFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 236/1024 [10:37:50<36:34:08, 167.07s/it][A
+                                                         [A{'loss': 0.0327, 'grad_norm': 0.0037221095990389585, 'learning_rate': 1e-05, 'num_tokens': 201153114.0, 'completions/mean_length': 7414.4921875, 'completions/min_length': 949.0, 'completions/max_length': 15328.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7414.4921875, 'completions/min_terminated_length': 949.0, 'completions/max_terminated_length': 15328.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.248829185962677, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021356744691729546, 'sampling/sampling_logp_difference/max': 6.99871301651001, 'sampling/importance_sampling_ratio/min': 0.0009130563121289015, 'sampling/importance_sampling_ratio/mean': 0.9999958872795105, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9571134969592094, 'clip_ratio/low_mean': 3.018811844412994e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7490709751655231e-06, 'clip_ratio/high_max': 6.9962839006620925e-06, 'clip_ratio/region_mean': 3.193718976035598e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 236/1024 [10:37:50<36:34:08, 167.07s/it][AINFO 12-02 00:02:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:02:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:02:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:02:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 237/1024 [10:40:13<34:55:30, 159.76s/it][A
+                                                         [A{'loss': 0.0641, 'grad_norm': 0.006285305600613356, 'learning_rate': 1e-05, 'num_tokens': 201933044.0, 'completions/mean_length': 5955.953125, 'completions/min_length': 1394.0, 'completions/max_length': 15835.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5955.953125, 'completions/min_terminated_length': 1394.0, 'completions/max_terminated_length': 15835.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.31011277437210083, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016975615173578262, 'sampling/sampling_logp_difference/max': 4.888189792633057, 'sampling/importance_sampling_ratio/min': 0.007535050623118877, 'sampling/importance_sampling_ratio/mean': 0.9999420642852783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.730999618768692, 'clip_ratio/low_mean': 5.4354991334548686e-05, 'clip_ratio/low_min': 6.868132004456129e-06, 'clip_ratio/high_mean': 2.8120230126660317e-06, 'clip_ratio/high_max': 1.1248092050664127e-05, 'clip_ratio/region_mean': 5.716701480196207e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 237/1024 [10:40:13<34:55:30, 159.76s/it][AINFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 238/1024 [10:42:52<34:50:50, 159.61s/it][A
+                                                         [A{'loss': 0.0793, 'grad_norm': 0.005667983554303646, 'learning_rate': 1e-05, 'num_tokens': 202837281.0, 'completions/mean_length': 6923.3515625, 'completions/min_length': 63.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6458.0732421875, 'completions/min_terminated_length': 63.0, 'completions/max_terminated_length': 15959.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.26826781034469604, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022059854120016098, 'sampling/sampling_logp_difference/max': 10.402952194213867, 'sampling/importance_sampling_ratio/min': 3.0342773243319243e-05, 'sampling/importance_sampling_ratio/mean': 0.999980092048645, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9938417226076126, 'clip_ratio/low_mean': 4.66828214484849e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.094216481258627e-06, 'clip_ratio/high_max': 7.226686648209579e-06, 'clip_ratio/region_mean': 4.977703792974353e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 238/1024 [10:42:52<34:50:50, 159.61s/it][AINFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 239/1024 [10:45:18<33:52:37, 155.36s/it][A
+                                                         [A{'loss': 0.0299, 'grad_norm': 0.004052883945405483, 'learning_rate': 1e-05, 'num_tokens': 203614448.0, 'completions/mean_length': 5930.9296875, 'completions/min_length': 343.0, 'completions/max_length': 14726.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5930.9296875, 'completions/min_terminated_length': 343.0, 'completions/max_terminated_length': 14726.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018519222736358643, 'sampling/sampling_logp_difference/max': 8.79355239868164, 'sampling/importance_sampling_ratio/min': 0.00015170808183029294, 'sampling/importance_sampling_ratio/mean': 0.999989926815033, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8100385963916779, 'clip_ratio/low_mean': 4.239228087499214e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3079692280371091e-06, 'clip_ratio/high_max': 5.2318769121484365e-06, 'clip_ratio/region_mean': 4.3700250216716086e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 239/1024 [10:45:18<33:52:37, 155.36s/it][AINFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 240/1024 [10:48:38<36:46:06, 168.84s/it][A
+                                                         [A{'loss': 0.0269, 'grad_norm': 0.004494607914239168, 'learning_rate': 1e-05, 'num_tokens': 204518261.0, 'completions/mean_length': 6911.1015625, 'completions/min_length': 862.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6108.3134765625, 'completions/min_terminated_length': 862.0, 'completions/max_terminated_length': 14996.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.34033796191215515, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020527629181742668, 'sampling/sampling_logp_difference/max': 6.484711647033691, 'sampling/importance_sampling_ratio/min': 0.0015266009140759706, 'sampling/importance_sampling_ratio/mean': 0.9998886585235596, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9260227829217911, 'clip_ratio/low_mean': 5.500513248080097e-05, 'clip_ratio/low_min': 7.924934834591113e-06, 'clip_ratio/high_mean': 1.226307745127997e-06, 'clip_ratio/high_max': 4.905230980511988e-06, 'clip_ratio/region_mean': 5.6231440112242126e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 240/1024 [10:48:38<36:46:06, 168.84s/it][AINFO 12-02 00:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 241/1024 [10:51:19<36:10:38, 166.33s/it][A
+                                                         [A{'loss': 0.0108, 'grad_norm': 0.0029451537411659956, 'learning_rate': 1e-05, 'num_tokens': 205433843.0, 'completions/mean_length': 6972.921875, 'completions/min_length': 438.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6823.5400390625, 'completions/min_terminated_length': 438.0, 'completions/max_terminated_length': 14637.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02013089321553707, 'sampling/sampling_logp_difference/max': 10.53177547454834, 'sampling/importance_sampling_ratio/min': 2.6675223125494085e-05, 'sampling/importance_sampling_ratio/mean': 1.0000104904174805, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0095533654093742, 'clip_ratio/low_mean': 4.75325257411896e-05, 'clip_ratio/low_min': 3.599504680096288e-06, 'clip_ratio/high_mean': 2.073441009997623e-06, 'clip_ratio/high_max': 8.293764039990492e-06, 'clip_ratio/region_mean': 4.960596663750039e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 241/1024 [10:51:19<36:10:38, 166.33s/it][AINFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 242/1024 [10:54:04<36:03:14, 165.98s/it][A
+                                                         [A{'loss': 0.073, 'grad_norm': 0.003371767932549119, 'learning_rate': 1e-05, 'num_tokens': 206310296.0, 'completions/mean_length': 6706.6640625, 'completions/min_length': 892.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6313.2763671875, 'completions/min_terminated_length': 892.0, 'completions/max_terminated_length': 16103.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.3537652790546417, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019770190119743347, 'sampling/sampling_logp_difference/max': 10.431736946105957, 'sampling/importance_sampling_ratio/min': 2.948181463580113e-05, 'sampling/importance_sampling_ratio/mean': 0.9999367594718933, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8647518903017044, 'clip_ratio/low_mean': 3.86000854177837e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.382379150527413e-05, 'clip_ratio/high_max': 4.163383164268453e-05, 'clip_ratio/region_mean': 5.2423876240936806e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 242/1024 [10:54:04<36:03:14, 165.98s/it][AINFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 243/1024 [10:56:56<36:26:03, 167.94s/it][A
+                                                         [A{'loss': 0.0281, 'grad_norm': 0.0016336971893906593, 'learning_rate': 1e-05, 'num_tokens': 207210974.0, 'completions/mean_length': 6882.609375, 'completions/min_length': 1119.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6415.32763671875, 'completions/min_terminated_length': 1119.0, 'completions/max_terminated_length': 16136.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.15650184452533722, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02139991894364357, 'sampling/sampling_logp_difference/max': 6.624994277954102, 'sampling/importance_sampling_ratio/min': 0.0013267879839986563, 'sampling/importance_sampling_ratio/mean': 0.9999210834503174, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.013342760503292, 'clip_ratio/low_mean': 2.4946740381892596e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.4946740381892596e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 243/1024 [10:56:56<36:26:03, 167.94s/it][AINFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 244/1024 [10:59:10<34:10:30, 157.73s/it][A
+                                                         [A{'loss': 0.0542, 'grad_norm': 0.005036406684666872, 'learning_rate': 1e-05, 'num_tokens': 208021893.0, 'completions/mean_length': 6195.7421875, 'completions/min_length': 409.0, 'completions/max_length': 15203.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6195.7421875, 'completions/min_terminated_length': 409.0, 'completions/max_terminated_length': 15203.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3453505039215088, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018679853528738022, 'sampling/sampling_logp_difference/max': 5.512784957885742, 'sampling/importance_sampling_ratio/min': 0.0040348549373447895, 'sampling/importance_sampling_ratio/mean': 0.9999955892562866, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8448907434940338, 'clip_ratio/low_mean': 3.938925010515959e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7166009860811755e-06, 'clip_ratio/high_max': 1.4866403944324702e-05, 'clip_ratio/region_mean': 4.310585177336179e-05, 'epoch': 0.22}
+
+ 24%|██▍       | 244/1024 [10:59:10<34:10:30, 157.73s/it][AINFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 245/1024 [11:01:51<34:20:13, 158.68s/it][A
+                                                         [A{'loss': 0.0907, 'grad_norm': 0.0029643685556948185, 'learning_rate': 1e-05, 'num_tokens': 208912059.0, 'completions/mean_length': 6829.609375, 'completions/min_length': 735.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6521.40283203125, 'completions/min_terminated_length': 735.0, 'completions/max_terminated_length': 16305.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3079911172389984, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018488366156816483, 'sampling/sampling_logp_difference/max': 7.873661994934082, 'sampling/importance_sampling_ratio/min': 0.00038063788088038564, 'sampling/importance_sampling_ratio/mean': 0.9999761581420898, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8679579794406891, 'clip_ratio/low_mean': 3.422392001084518e-05, 'clip_ratio/low_min': 6.451612989621935e-06, 'clip_ratio/high_mean': 2.811220838339068e-06, 'clip_ratio/high_max': 1.1244883353356272e-05, 'clip_ratio/region_mean': 3.703514119024476e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 245/1024 [11:01:51<34:20:13, 158.68s/it][AINFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 246/1024 [11:04:10<33:00:12, 152.71s/it][A
+                                                         [A{'loss': 0.0932, 'grad_norm': 0.0035942886024713516, 'learning_rate': 1e-05, 'num_tokens': 209627804.0, 'completions/mean_length': 5444.4453125, 'completions/min_length': 575.0, 'completions/max_length': 14503.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5444.4453125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 14503.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.338498055934906, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020146891474723816, 'sampling/sampling_logp_difference/max': 3.4484035968780518, 'sampling/importance_sampling_ratio/min': 0.03179635480046272, 'sampling/importance_sampling_ratio/mean': 0.99997478723526, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0460086688399315, 'clip_ratio/low_mean': 3.138338854569156e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.675150077877333e-06, 'clip_ratio/high_max': 2.2700600311509334e-05, 'clip_ratio/region_mean': 3.705853873725573e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 246/1024 [11:04:10<33:00:12, 152.71s/it][AINFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 247/1024 [11:07:06<34:27:59, 159.69s/it][A
+                                                         [A{'loss': 0.0699, 'grad_norm': 0.0044983453117311, 'learning_rate': 1e-05, 'num_tokens': 210630150.0, 'completions/mean_length': 7657.390625, 'completions/min_length': 1048.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7152.544921875, 'completions/min_terminated_length': 1048.0, 'completions/max_terminated_length': 16244.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02131088823080063, 'sampling/sampling_logp_difference/max': 10.158285140991211, 'sampling/importance_sampling_ratio/min': 3.8753667467972264e-05, 'sampling/importance_sampling_ratio/mean': 1.0000007152557373, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9528728649020195, 'clip_ratio/low_mean': 5.265122354103369e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.552578502625693e-06, 'clip_ratio/high_max': 1.477029400120955e-05, 'clip_ratio/region_mean': 5.720380158891203e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 247/1024 [11:07:06<34:27:59, 159.69s/it][AINFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 248/1024 [11:09:55<35:01:14, 162.47s/it][A
+                                                         [A{'loss': 0.0566, 'grad_norm': 0.006095650140196085, 'learning_rate': 1e-05, 'num_tokens': 211620355.0, 'completions/mean_length': 7574.3515625, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7504.984375, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 16284.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021727774292230606, 'sampling/sampling_logp_difference/max': 6.575083255767822, 'sampling/importance_sampling_ratio/min': 0.0013946897815912962, 'sampling/importance_sampling_ratio/mean': 1.0000433921813965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0009776800870895, 'clip_ratio/low_mean': 2.2759413695894182e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.151910678094282e-06, 'clip_ratio/high_max': 8.607642712377128e-06, 'clip_ratio/region_mean': 2.491132454451872e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 248/1024 [11:09:55<35:01:14, 162.47s/it][AINFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 249/1024 [11:12:51<35:51:41, 166.58s/it][A
+                                                         [A{'loss': 0.0364, 'grad_norm': 0.0037038614973425865, 'learning_rate': 1e-05, 'num_tokens': 212654747.0, 'completions/mean_length': 7919.6875, 'completions/min_length': 1517.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7716.54443359375, 'completions/min_terminated_length': 1517.0, 'completions/max_terminated_length': 14915.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.022051017731428146, 'sampling/sampling_logp_difference/max': 5.157684326171875, 'sampling/importance_sampling_ratio/min': 0.0057550109922885895, 'sampling/importance_sampling_ratio/mean': 0.9999381899833679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0405654236674309, 'clip_ratio/low_mean': 5.936152001595474e-05, 'clip_ratio/low_min': 9.155588486464694e-06, 'clip_ratio/high_mean': 5.141430960975413e-06, 'clip_ratio/high_max': 1.764823082339717e-05, 'clip_ratio/region_mean': 6.450295177273802e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 249/1024 [11:12:51<35:51:41, 166.58s/it][AINFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 250/1024 [11:16:22<38:42:46, 180.06s/it][A
+                                                         [A{'loss': 0.0571, 'grad_norm': 0.00325607368722558, 'learning_rate': 1e-05, 'num_tokens': 213774584.0, 'completions/mean_length': 8613.4765625, 'completions/min_length': 694.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7735.0693359375, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 16122.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.33668074011802673, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020002499222755432, 'sampling/sampling_logp_difference/max': 10.999996185302734, 'sampling/importance_sampling_ratio/min': 1.670176425250247e-05, 'sampling/importance_sampling_ratio/mean': 1.000060796737671, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.890489287674427, 'clip_ratio/low_mean': 4.716233138424286e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1631356023353874e-06, 'clip_ratio/high_max': 1.265254240934155e-05, 'clip_ratio/region_mean': 5.032546687289141e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 250/1024 [11:16:22<38:42:46, 180.06s/it][AINFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 251/1024 [11:19:29<39:03:49, 181.93s/it][A
+                                                         [A{'loss': 0.0717, 'grad_norm': 0.0038265211042016745, 'learning_rate': 1e-05, 'num_tokens': 214728371.0, 'completions/mean_length': 7324.8984375, 'completions/min_length': 704.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6473.1884765625, 'completions/min_terminated_length': 704.0, 'completions/max_terminated_length': 16022.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.32719239592552185, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018367979675531387, 'sampling/sampling_logp_difference/max': 8.095518112182617, 'sampling/importance_sampling_ratio/min': 0.0003049026126973331, 'sampling/importance_sampling_ratio/mean': 1.0000168085098267, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.761004202067852, 'clip_ratio/low_mean': 3.880500707964529e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.101151375834888e-06, 'clip_ratio/high_max': 1.6404605503339553e-05, 'clip_ratio/region_mean': 4.2906158682853857e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 251/1024 [11:19:29<39:03:49, 181.93s/it][AINFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 252/1024 [11:21:50<36:22:09, 169.60s/it][A
+                                                         [A{'loss': 0.0158, 'grad_norm': 0.002729539293795824, 'learning_rate': 1e-05, 'num_tokens': 215570806.0, 'completions/mean_length': 6422.0859375, 'completions/min_length': 373.0, 'completions/max_length': 14167.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6422.0859375, 'completions/min_terminated_length': 373.0, 'completions/max_terminated_length': 14167.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.25620076060295105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021903935819864273, 'sampling/sampling_logp_difference/max': 3.637866497039795, 'sampling/importance_sampling_ratio/min': 0.026308411732316017, 'sampling/importance_sampling_ratio/mean': 0.9999935030937195, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9946094751358032, 'clip_ratio/low_mean': 3.6433707123251224e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4061374713492114e-06, 'clip_ratio/high_max': 5.624549885396846e-06, 'clip_ratio/region_mean': 3.7839844594600436e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 252/1024 [11:21:50<36:22:09, 169.60s/it][AINFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 253/1024 [11:24:38<36:13:06, 169.11s/it][A
+                                                         [A{'loss': 0.1011, 'grad_norm': 0.004974282346665859, 'learning_rate': 1e-05, 'num_tokens': 216465635.0, 'completions/mean_length': 6845.2890625, 'completions/min_length': 1252.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6693.88134765625, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 15585.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.30061954259872437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019389234483242035, 'sampling/sampling_logp_difference/max': 9.343890190124512, 'sampling/importance_sampling_ratio/min': 8.749838889343664e-05, 'sampling/importance_sampling_ratio/mean': 1.0000090599060059, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8822609707713127, 'clip_ratio/low_mean': 3.17277934982485e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8094962115355884e-06, 'clip_ratio/high_max': 7.2379848461423535e-06, 'clip_ratio/region_mean': 3.353728982347093e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 253/1024 [11:24:38<36:13:06, 169.11s/it][AINFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 254/1024 [11:27:43<37:12:25, 173.96s/it][A
+                                                         [A{'loss': 0.0538, 'grad_norm': 0.0033159854356199503, 'learning_rate': 1e-05, 'num_tokens': 217485089.0, 'completions/mean_length': 7805.484375, 'completions/min_length': 435.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7528.7578125, 'completions/min_terminated_length': 435.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.33114904165267944, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021925684064626694, 'sampling/sampling_logp_difference/max': 9.437499046325684, 'sampling/importance_sampling_ratio/min': 7.967943383846432e-05, 'sampling/importance_sampling_ratio/mean': 0.9999412298202515, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9977599084377289, 'clip_ratio/low_mean': 4.096964960353944e-05, 'clip_ratio/low_min': 1.7403560605089297e-05, 'clip_ratio/high_mean': 3.9648204506193e-06, 'clip_ratio/high_max': 1.58592818024772e-05, 'clip_ratio/region_mean': 4.49344687467601e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 254/1024 [11:27:43<37:12:25, 173.96s/it][AINFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 255/1024 [11:30:41<37:27:39, 175.37s/it][A
+                                                         [A{'loss': 0.0775, 'grad_norm': 0.0034952745772898197, 'learning_rate': 1e-05, 'num_tokens': 218496040.0, 'completions/mean_length': 7737.5546875, 'completions/min_length': 713.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7530.04052734375, 'completions/min_terminated_length': 713.0, 'completions/max_terminated_length': 15681.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3085102438926697, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019742710515856743, 'sampling/sampling_logp_difference/max': 9.606889724731445, 'sampling/importance_sampling_ratio/min': 6.726370338583365e-05, 'sampling/importance_sampling_ratio/mean': 0.9999128580093384, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8667014688253403, 'clip_ratio/low_mean': 4.044636898470344e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.566349389278912e-06, 'clip_ratio/high_max': 1.8265397557115648e-05, 'clip_ratio/region_mean': 4.5012717691861326e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 255/1024 [11:30:41<37:27:39, 175.37s/it][AINFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 256/1024 [11:33:39<37:32:41, 175.99s/it][A
+                                                         [A{'loss': 0.0667, 'grad_norm': 0.0038676802068948746, 'learning_rate': 1e-05, 'num_tokens': 219459140.0, 'completions/mean_length': 7365.84375, 'completions/min_length': 744.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6601.59326171875, 'completions/min_terminated_length': 744.0, 'completions/max_terminated_length': 15858.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018882082775235176, 'sampling/sampling_logp_difference/max': 8.360733985900879, 'sampling/importance_sampling_ratio/min': 0.00023387260443996638, 'sampling/importance_sampling_ratio/mean': 0.9999598264694214, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8151945173740387, 'clip_ratio/low_mean': 3.204250072030845e-05, 'clip_ratio/low_min': 3.323495775475749e-06, 'clip_ratio/high_mean': 2.0610737010429148e-06, 'clip_ratio/high_max': 8.244294804171659e-06, 'clip_ratio/region_mean': 3.410357436450795e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 256/1024 [11:33:39<37:32:41, 175.99s/it][AINFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 257/1024 [11:36:16<36:16:30, 170.26s/it][A
+                                                         [A{'loss': 0.1082, 'grad_norm': 0.004310046322643757, 'learning_rate': 1e-05, 'num_tokens': 220304605.0, 'completions/mean_length': 6448.0078125, 'completions/min_length': 1128.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6369.771484375, 'completions/min_terminated_length': 1128.0, 'completions/max_terminated_length': 14556.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.35611939430236816, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020253397524356842, 'sampling/sampling_logp_difference/max': 8.99997615814209, 'sampling/importance_sampling_ratio/min': 0.0001234127557836473, 'sampling/importance_sampling_ratio/mean': 0.9999396800994873, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9546648040413857, 'clip_ratio/low_mean': 5.435333650893881e-05, 'clip_ratio/low_min': 5.33937054569833e-06, 'clip_ratio/high_mean': 2.9462287329806713e-06, 'clip_ratio/high_max': 6.87833608026267e-06, 'clip_ratio/region_mean': 5.729956546929316e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 257/1024 [11:36:16<36:16:30, 170.26s/it][AINFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 258/1024 [11:39:35<38:05:11, 179.00s/it][A
+                                                         [A{'loss': 0.042, 'grad_norm': 0.0026646999176591635, 'learning_rate': 1e-05, 'num_tokens': 221281968.0, 'completions/mean_length': 7457.6484375, 'completions/min_length': 604.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6941.24755859375, 'completions/min_terminated_length': 604.0, 'completions/max_terminated_length': 16037.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2012200653553009, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019208962097764015, 'sampling/sampling_logp_difference/max': 12.749988555908203, 'sampling/importance_sampling_ratio/min': 2.902353571698768e-06, 'sampling/importance_sampling_ratio/mean': 0.9999173283576965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8182889074087143, 'clip_ratio/low_mean': 2.5416685957679874e-05, 'clip_ratio/low_min': 5.5736391004757024e-06, 'clip_ratio/high_mean': 1.5490235227844096e-06, 'clip_ratio/high_max': 6.196094091137638e-06, 'clip_ratio/region_mean': 2.696570959415112e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 258/1024 [11:39:35<38:05:11, 179.00s/it][AINFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 259/1024 [11:42:39<38:19:18, 180.34s/it][A
+                                                         [A{'loss': 0.0131, 'grad_norm': 0.0016026750672608614, 'learning_rate': 1e-05, 'num_tokens': 222399046.0, 'completions/mean_length': 8561.109375, 'completions/min_length': 558.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7969.79052734375, 'completions/min_terminated_length': 558.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02173236384987831, 'sampling/sampling_logp_difference/max': 13.312499046325684, 'sampling/importance_sampling_ratio/min': 1.653693971093162e-06, 'sampling/importance_sampling_ratio/mean': 1.000004529953003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9581378549337387, 'clip_ratio/low_mean': 3.127787306311802e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.047383754368639e-06, 'clip_ratio/high_max': 1.6189535017474554e-05, 'clip_ratio/region_mean': 3.532525670379982e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 259/1024 [11:42:39<38:19:18, 180.34s/it][AINFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 260/1024 [11:45:47<38:46:26, 182.70s/it][A
+                                                         [A{'loss': 0.0845, 'grad_norm': 0.005460259038954973, 'learning_rate': 1e-05, 'num_tokens': 223335010.0, 'completions/mean_length': 7152.34375, 'completions/min_length': 130.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7079.6533203125, 'completions/min_terminated_length': 130.0, 'completions/max_terminated_length': 16239.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3356297016143799, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01986619457602501, 'sampling/sampling_logp_difference/max': 4.589165210723877, 'sampling/importance_sampling_ratio/min': 0.010161337442696095, 'sampling/importance_sampling_ratio/mean': 0.9999966621398926, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9052041247487068, 'clip_ratio/low_mean': 5.2955770115659107e-05, 'clip_ratio/low_min': 3.402656830076012e-06, 'clip_ratio/high_mean': 4.3255887476334465e-06, 'clip_ratio/high_max': 1.4200771602190798e-05, 'clip_ratio/region_mean': 5.7281358749605715e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 260/1024 [11:45:47<38:46:26, 182.70s/it][AINFO 12-02 01:10:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:10:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:10:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:10:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 261/1024 [11:48:22<36:58:10, 174.43s/it][A
+                                                         [A{'loss': 0.0966, 'grad_norm': 0.005933742038905621, 'learning_rate': 1e-05, 'num_tokens': 224207006.0, 'completions/mean_length': 6678.65625, 'completions/min_length': 963.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6524.603515625, 'completions/min_terminated_length': 963.0, 'completions/max_terminated_length': 15631.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3316681981086731, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019827336072921753, 'sampling/sampling_logp_difference/max': 6.747769355773926, 'sampling/importance_sampling_ratio/min': 0.0011734943836927414, 'sampling/importance_sampling_ratio/mean': 1.000031590461731, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9043187350034714, 'clip_ratio/low_mean': 3.81288905373367e-05, 'clip_ratio/low_min': 8.099272235995159e-06, 'clip_ratio/high_mean': 3.5875787034456152e-06, 'clip_ratio/high_max': 1.4350314813782461e-05, 'clip_ratio/region_mean': 4.1716469809216505e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 261/1024 [11:48:22<36:58:10, 174.43s/it][AINFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 262/1024 [11:51:06<36:15:39, 171.31s/it][A
+                                                         [A{'loss': 0.104, 'grad_norm': 0.003635740838944912, 'learning_rate': 1e-05, 'num_tokens': 225122891.0, 'completions/mean_length': 6999.0390625, 'completions/min_length': 990.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6850.07177734375, 'completions/min_terminated_length': 990.0, 'completions/max_terminated_length': 15972.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.323777437210083, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018912551924586296, 'sampling/sampling_logp_difference/max': 10.987512588500977, 'sampling/importance_sampling_ratio/min': 1.6911570128286257e-05, 'sampling/importance_sampling_ratio/mean': 0.9999303817749023, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8109970837831497, 'clip_ratio/low_mean': 3.601791678420341e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.124704844343796e-06, 'clip_ratio/high_max': 1.6498819377375185e-05, 'clip_ratio/region_mean': 4.014262168539062e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 262/1024 [11:51:06<36:15:39, 171.31s/it][AINFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 263/1024 [11:53:56<36:07:50, 170.92s/it][A
+                                                         [A{'loss': 0.0458, 'grad_norm': 0.003405241761356592, 'learning_rate': 1e-05, 'num_tokens': 226102462.0, 'completions/mean_length': 7483.7109375, 'completions/min_length': 1153.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7045.9912109375, 'completions/min_terminated_length': 1153.0, 'completions/max_terminated_length': 15713.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3022220730781555, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021076779812574387, 'sampling/sampling_logp_difference/max': 5.249300479888916, 'sampling/importance_sampling_ratio/min': 0.00525119062513113, 'sampling/importance_sampling_ratio/mean': 1.00002920627594, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9473970532417297, 'clip_ratio/low_mean': 3.766565987461945e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3818944896447647e-06, 'clip_ratio/high_max': 9.527577958579059e-06, 'clip_ratio/region_mean': 4.004755419373396e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 263/1024 [11:53:56<36:07:50, 170.92s/it][AINFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 264/1024 [11:56:45<35:58:50, 170.43s/it][A
+                                                         [A{'loss': 0.0801, 'grad_norm': 0.0025927501264959574, 'learning_rate': 1e-05, 'num_tokens': 227093562.0, 'completions/mean_length': 7569.03125, 'completions/min_length': 893.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7357.47216796875, 'completions/min_terminated_length': 893.0, 'completions/max_terminated_length': 16256.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.19097033143043518, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020578444004058838, 'sampling/sampling_logp_difference/max': 5.249953269958496, 'sampling/importance_sampling_ratio/min': 0.0052477638237178326, 'sampling/importance_sampling_ratio/mean': 0.9999816417694092, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9231455475091934, 'clip_ratio/low_mean': 3.8259706570897833e-05, 'clip_ratio/low_min': 3.549019083948224e-06, 'clip_ratio/high_mean': 3.966830490753637e-06, 'clip_ratio/high_max': 1.5867321963014547e-05, 'clip_ratio/region_mean': 4.2226537743772496e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 264/1024 [11:56:45<35:58:50, 170.43s/it][AINFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 265/1024 [11:59:44<36:28:05, 172.97s/it][A
+                                                         [A{'loss': 0.04, 'grad_norm': 0.0030512227676808834, 'learning_rate': 1e-05, 'num_tokens': 228086405.0, 'completions/mean_length': 7589.2734375, 'completions/min_length': 130.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7378.2001953125, 'completions/min_terminated_length': 130.0, 'completions/max_terminated_length': 15819.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.27905434370040894, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020208362489938736, 'sampling/sampling_logp_difference/max': 8.437499046325684, 'sampling/importance_sampling_ratio/min': 0.0002165911573683843, 'sampling/importance_sampling_ratio/mean': 1.000004529953003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9265239909291267, 'clip_ratio/low_mean': 4.253613235505327e-05, 'clip_ratio/low_min': 3.5579084851633525e-06, 'clip_ratio/high_mean': 3.36022765168309e-06, 'clip_ratio/high_max': 1.344091060673236e-05, 'clip_ratio/region_mean': 4.5896360120423196e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 265/1024 [11:59:44<36:28:05, 172.97s/it][AINFO 12-02 01:24:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:24:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:24:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:24:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 266/1024 [12:02:55<37:32:23, 178.29s/it][A
+                                                         [A{'loss': 0.0444, 'grad_norm': 0.0022430522367358208, 'learning_rate': 1e-05, 'num_tokens': 229183765.0, 'completions/mean_length': 8420.6875, 'completions/min_length': 1114.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8096.97509765625, 'completions/min_terminated_length': 1114.0, 'completions/max_terminated_length': 16275.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.309583842754364, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021570362150669098, 'sampling/sampling_logp_difference/max': 8.121989250183105, 'sampling/importance_sampling_ratio/min': 0.00029693738906644285, 'sampling/importance_sampling_ratio/mean': 0.9999421834945679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9572964608669281, 'clip_ratio/low_mean': 3.184792547017423e-05, 'clip_ratio/low_min': 7.29296516510658e-06, 'clip_ratio/high_mean': 4.903381352505676e-06, 'clip_ratio/high_max': 1.9613525410022703e-05, 'clip_ratio/region_mean': 3.675130722058384e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 266/1024 [12:02:55<37:32:23, 178.29s/it][AINFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 267/1024 [12:05:29<35:57:12, 170.98s/it][A
+                                                         [A{'loss': 0.1058, 'grad_norm': 0.004295065999031067, 'learning_rate': 1e-05, 'num_tokens': 230077607.0, 'completions/mean_length': 6809.765625, 'completions/min_length': 860.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6579.984375, 'completions/min_terminated_length': 860.0, 'completions/max_terminated_length': 15736.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.20251333713531494, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019895706325769424, 'sampling/sampling_logp_difference/max': 4.886721134185791, 'sampling/importance_sampling_ratio/min': 0.00754612497985363, 'sampling/importance_sampling_ratio/mean': 0.9999294281005859, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.884086549282074, 'clip_ratio/low_mean': 2.1682553096979973e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6821876442918438e-06, 'clip_ratio/high_max': 6.728750577167375e-06, 'clip_ratio/region_mean': 2.336474062758498e-05, 'epoch': 0.25}
+
+ 26%|██▌       | 267/1024 [12:05:29<35:57:12, 170.98s/it][AINFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 268/1024 [12:08:25<36:14:27, 172.58s/it][A
+                                                         [A{'loss': 0.0096, 'grad_norm': 0.004631794057786465, 'learning_rate': 1e-05, 'num_tokens': 231035616.0, 'completions/mean_length': 7340.6953125, 'completions/min_length': 1616.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6973.0810546875, 'completions/min_terminated_length': 1616.0, 'completions/max_terminated_length': 15080.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3235401213169098, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020591016858816147, 'sampling/sampling_logp_difference/max': 8.290475845336914, 'sampling/importance_sampling_ratio/min': 0.0002508950710762292, 'sampling/importance_sampling_ratio/mean': 0.9999337792396545, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9920620769262314, 'clip_ratio/low_mean': 5.158006410965754e-05, 'clip_ratio/low_min': 5.210069957684027e-06, 'clip_ratio/high_mean': 7.152336877425114e-06, 'clip_ratio/high_max': 2.8609347509700456e-05, 'clip_ratio/region_mean': 5.873240070286556e-05, 'epoch': 0.25}
+
+ 26%|██▌       | 268/1024 [12:08:25<36:14:27, 172.58s/it][AINFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 269/1024 [12:11:07<35:32:52, 169.50s/it][A
+                                                         [A{'loss': 0.0455, 'grad_norm': 0.0035752104595303535, 'learning_rate': 1e-05, 'num_tokens': 231920056.0, 'completions/mean_length': 6748.875, 'completions/min_length': 1169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6595.93701171875, 'completions/min_terminated_length': 1169.0, 'completions/max_terminated_length': 14120.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02061416581273079, 'sampling/sampling_logp_difference/max': 7.8571391105651855, 'sampling/importance_sampling_ratio/min': 0.0003869794018100947, 'sampling/importance_sampling_ratio/mean': 0.9999653100967407, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9867061004042625, 'clip_ratio/low_mean': 4.3085940774290066e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.3085940774290066e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 269/1024 [12:11:07<35:32:52, 169.50s/it][AINFO 12-02 01:36:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 270/1024 [12:13:48<34:54:45, 166.69s/it][A
+                                                         [A{'loss': 0.0711, 'grad_norm': 0.0036644963547587395, 'learning_rate': 1e-05, 'num_tokens': 232869159.0, 'completions/mean_length': 7260.3046875, 'completions/min_length': 1384.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7188.46435546875, 'completions/min_terminated_length': 1384.0, 'completions/max_terminated_length': 15706.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2359209954738617, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02120530977845192, 'sampling/sampling_logp_difference/max': 7.051599502563477, 'sampling/importance_sampling_ratio/min': 0.0008660226594656706, 'sampling/importance_sampling_ratio/mean': 0.9999546408653259, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0388494208455086, 'clip_ratio/low_mean': 3.10397430212106e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1266876021618373e-06, 'clip_ratio/high_max': 1.2506750408647349e-05, 'clip_ratio/region_mean': 3.416643085074611e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 270/1024 [12:13:48<34:54:45, 166.69s/it][AINFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 271/1024 [12:16:25<34:16:39, 163.88s/it][A
+                                                         [A{'loss': 0.0039, 'grad_norm': 0.004709267523139715, 'learning_rate': 1e-05, 'num_tokens': 233702842.0, 'completions/mean_length': 6354.4609375, 'completions/min_length': 1035.0, 'completions/max_length': 16073.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6354.4609375, 'completions/min_terminated_length': 1035.0, 'completions/max_terminated_length': 16073.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.3214184641838074, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019126038998365402, 'sampling/sampling_logp_difference/max': 5.37499475479126, 'sampling/importance_sampling_ratio/min': 0.0046309432946145535, 'sampling/importance_sampling_ratio/mean': 0.9999738931655884, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8405331820249557, 'clip_ratio/low_mean': 3.1861192269388994e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.760888254575548e-06, 'clip_ratio/high_max': 2.704355301830219e-05, 'clip_ratio/region_mean': 3.862208097871189e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 271/1024 [12:16:25<34:16:39, 163.88s/it][AINFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 272/1024 [12:19:12<34:27:03, 164.93s/it][A
+                                                         [A{'loss': 0.0757, 'grad_norm': 0.003066045930609107, 'learning_rate': 1e-05, 'num_tokens': 234556348.0, 'completions/mean_length': 6514.578125, 'completions/min_length': 982.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6357.9208984375, 'completions/min_terminated_length': 982.0, 'completions/max_terminated_length': 16026.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019960148259997368, 'sampling/sampling_logp_difference/max': 5.257136344909668, 'sampling/importance_sampling_ratio/min': 0.005210204049944878, 'sampling/importance_sampling_ratio/mean': 0.9999805092811584, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0254098922014236, 'clip_ratio/low_mean': 3.855073941849696e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.437307159652846e-06, 'clip_ratio/high_max': 9.749228638611385e-06, 'clip_ratio/region_mean': 4.098804652130639e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 272/1024 [12:19:12<34:27:03, 164.93s/it][AINFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 273/1024 [12:22:06<34:58:24, 167.65s/it][A
+                                                         [A{'loss': 0.062, 'grad_norm': 0.005132520105689764, 'learning_rate': 1e-05, 'num_tokens': 235521091.0, 'completions/mean_length': 7379.5546875, 'completions/min_length': 701.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7236.62744140625, 'completions/min_terminated_length': 701.0, 'completions/max_terminated_length': 15894.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2301519364118576, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021417103707790375, 'sampling/sampling_logp_difference/max': 8.699974060058594, 'sampling/importance_sampling_ratio/min': 0.00016659013635944575, 'sampling/importance_sampling_ratio/mean': 0.9999256134033203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0397320613265038, 'clip_ratio/low_mean': 3.487835761006863e-05, 'clip_ratio/low_min': 2.9392399483185727e-06, 'clip_ratio/high_mean': 2.6189534310105955e-06, 'clip_ratio/high_max': 1.0475813724042382e-05, 'clip_ratio/region_mean': 3.749731081370555e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 273/1024 [12:22:06<34:58:24, 167.65s/it][AINFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 274/1024 [12:24:58<35:11:27, 168.92s/it][A
+                                                         [A{'loss': 0.0143, 'grad_norm': 0.0028969801496714354, 'learning_rate': 1e-05, 'num_tokens': 236544160.0, 'completions/mean_length': 7837.1640625, 'completions/min_length': 1346.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7632.04052734375, 'completions/min_terminated_length': 1346.0, 'completions/max_terminated_length': 14565.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.29378965497016907, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019267702475190163, 'sampling/sampling_logp_difference/max': 15.059157371520996, 'sampling/importance_sampling_ratio/min': 2.883308241052873e-07, 'sampling/importance_sampling_ratio/mean': 0.9999887943267822, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8400963917374611, 'clip_ratio/low_mean': 2.6659268655748747e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.776861314643611e-06, 'clip_ratio/high_max': 1.9904123973901733e-05, 'clip_ratio/region_mean': 3.2436129686175263e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 274/1024 [12:24:58<35:11:27, 168.92s/it][AINFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 275/1024 [12:27:26<33:48:40, 162.51s/it][A
+                                                         [A{'loss': 0.0803, 'grad_norm': 0.003412836929783225, 'learning_rate': 1e-05, 'num_tokens': 237423101.0, 'completions/mean_length': 6696.3515625, 'completions/min_length': 1239.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6542.57958984375, 'completions/min_terminated_length': 1239.0, 'completions/max_terminated_length': 15350.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.37981897592544556, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018458625301718712, 'sampling/sampling_logp_difference/max': 4.410195827484131, 'sampling/importance_sampling_ratio/min': 0.012152798473834991, 'sampling/importance_sampling_ratio/mean': 1.0000269412994385, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8495818004012108, 'clip_ratio/low_mean': 4.060094340729847e-05, 'clip_ratio/low_min': 3.8700886761944275e-06, 'clip_ratio/high_mean': 2.1406925725386827e-06, 'clip_ratio/high_max': 8.562770290154731e-06, 'clip_ratio/region_mean': 4.2741635979837156e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 275/1024 [12:27:26<33:48:40, 162.51s/it][AINFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 276/1024 [12:30:41<35:48:44, 172.36s/it][A
+                                                         [A{'loss': 0.0604, 'grad_norm': 0.0024443145375698805, 'learning_rate': 1e-05, 'num_tokens': 238429956.0, 'completions/mean_length': 7700.3671875, 'completions/min_length': 844.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7121.45849609375, 'completions/min_terminated_length': 844.0, 'completions/max_terminated_length': 15666.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2872493863105774, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019427984952926636, 'sampling/sampling_logp_difference/max': 8.250510215759277, 'sampling/importance_sampling_ratio/min': 0.00026112530031241477, 'sampling/importance_sampling_ratio/mean': 0.9999113082885742, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8258870914578438, 'clip_ratio/low_mean': 6.144847083078275e-05, 'clip_ratio/low_min': 1.110105540647055e-05, 'clip_ratio/high_mean': 3.646129641765583e-06, 'clip_ratio/high_max': 1.1463653436294408e-05, 'clip_ratio/region_mean': 6.509460160941671e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 276/1024 [12:30:41<35:48:44, 172.36s/it][AINFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 277/1024 [12:33:18<34:48:07, 167.72s/it][A
+                                                         [A{'loss': 0.0222, 'grad_norm': 0.0022747826296836138, 'learning_rate': 1e-05, 'num_tokens': 239250160.0, 'completions/mean_length': 6255.21875, 'completions/min_length': 793.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6094.44482421875, 'completions/min_terminated_length': 793.0, 'completions/max_terminated_length': 16112.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018723051995038986, 'sampling/sampling_logp_difference/max': 8.241846084594727, 'sampling/importance_sampling_ratio/min': 0.0002633975527714938, 'sampling/importance_sampling_ratio/mean': 0.9999998807907104, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8179014846682549, 'clip_ratio/low_mean': 1.7289162997258245e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0545319355514948e-06, 'clip_ratio/high_max': 4.218127742205979e-06, 'clip_ratio/region_mean': 1.834369493280974e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 277/1024 [12:33:18<34:48:07, 167.72s/it][AINFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 278/1024 [12:35:50<33:46:12, 162.97s/it][A
+                                                         [A{'loss': -0.0056, 'grad_norm': 0.005685295443981886, 'learning_rate': 1e-05, 'num_tokens': 240156211.0, 'completions/mean_length': 6914.9609375, 'completions/min_length': 730.0, 'completions/max_length': 15321.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6914.9609375, 'completions/min_terminated_length': 730.0, 'completions/max_terminated_length': 15321.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021195171400904655, 'sampling/sampling_logp_difference/max': 9.997581481933594, 'sampling/importance_sampling_ratio/min': 4.5509867049986497e-05, 'sampling/importance_sampling_ratio/mean': 0.9998887777328491, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9700981751084328, 'clip_ratio/low_mean': 6.14647315160255e-05, 'clip_ratio/low_min': 5.043576493335422e-06, 'clip_ratio/high_mean': 5.369374321162468e-06, 'clip_ratio/high_max': 1.698448841125355e-05, 'clip_ratio/region_mean': 6.683410583718796e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 278/1024 [12:35:50<33:46:12, 162.97s/it][AINFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 279/1024 [12:38:12<32:27:38, 156.86s/it][A
+                                                         [A{'loss': 0.1246, 'grad_norm': 0.003880272386595607, 'learning_rate': 1e-05, 'num_tokens': 240845295.0, 'completions/mean_length': 5227.53125, 'completions/min_length': 647.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5139.68505859375, 'completions/min_terminated_length': 647.0, 'completions/max_terminated_length': 15469.0, 'rewards/accuracy_reward/mean': 0.6328125, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.6328125, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018801718950271606, 'sampling/sampling_logp_difference/max': 8.993386268615723, 'sampling/importance_sampling_ratio/min': 0.00012422871077433228, 'sampling/importance_sampling_ratio/mean': 1.0000362396240234, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9116031974554062, 'clip_ratio/low_mean': 2.9186837764427764e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.9186837764427764e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 279/1024 [12:38:12<32:27:38, 156.86s/it][AINFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 280/1024 [12:41:08<33:34:54, 162.49s/it][A
+                                                         [A{'loss': 0.0947, 'grad_norm': 0.0028986844699829817, 'learning_rate': 1e-05, 'num_tokens': 241895676.0, 'completions/mean_length': 8065.4765625, 'completions/min_length': 1055.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7510.90869140625, 'completions/min_terminated_length': 1055.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3474721610546112, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01853121444582939, 'sampling/sampling_logp_difference/max': 6.3748297691345215, 'sampling/importance_sampling_ratio/min': 0.0017039099475368857, 'sampling/importance_sampling_ratio/mean': 0.9999842643737793, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7446574792265892, 'clip_ratio/low_mean': 5.524710468307603e-05, 'clip_ratio/low_min': 3.776891389861703e-06, 'clip_ratio/high_mean': 8.084949570275057e-06, 'clip_ratio/high_max': 2.5015486926349695e-05, 'clip_ratio/region_mean': 6.333205465125502e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 280/1024 [12:41:08<33:34:54, 162.49s/it][AINFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 281/1024 [12:43:37<32:40:28, 158.32s/it][A
+                                                         [A{'loss': 0.0368, 'grad_norm': 0.003845847910270095, 'learning_rate': 1e-05, 'num_tokens': 242698258.0, 'completions/mean_length': 6127.359375, 'completions/min_length': 848.0, 'completions/max_length': 15534.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6127.359375, 'completions/min_terminated_length': 848.0, 'completions/max_terminated_length': 15534.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01856958493590355, 'sampling/sampling_logp_difference/max': 7.746356964111328, 'sampling/importance_sampling_ratio/min': 0.00043231461313553154, 'sampling/importance_sampling_ratio/mean': 1.0000942945480347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8569132760167122, 'clip_ratio/low_mean': 2.896106741445692e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.371585253513331e-06, 'clip_ratio/high_max': 9.486341014053323e-06, 'clip_ratio/region_mean': 3.133265261112683e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 281/1024 [12:43:37<32:40:28, 158.32s/it][AINFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 282/1024 [12:46:22<33:04:18, 160.46s/it][A
+                                                         [A{'loss': 0.0666, 'grad_norm': 0.003953634761273861, 'learning_rate': 1e-05, 'num_tokens': 243560957.0, 'completions/mean_length': 6600.1484375, 'completions/min_length': 1252.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6365.33642578125, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 15192.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018097909167408943, 'sampling/sampling_logp_difference/max': 7.334624767303467, 'sampling/importance_sampling_ratio/min': 0.0006525487406179309, 'sampling/importance_sampling_ratio/mean': 0.9999537467956543, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.78924310952425, 'clip_ratio/low_mean': 4.3558867673709756e-05, 'clip_ratio/low_min': 4.417741820361698e-06, 'clip_ratio/high_mean': 7.4620825216697995e-06, 'clip_ratio/high_max': 2.9848330086679198e-05, 'clip_ratio/region_mean': 5.1020949285884853e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 282/1024 [12:46:22<33:04:18, 160.46s/it][AINFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 283/1024 [12:49:12<33:35:54, 163.23s/it][A
+                                                         [A{'loss': 0.0265, 'grad_norm': 0.00360781978815794, 'learning_rate': 1e-05, 'num_tokens': 244585923.0, 'completions/mean_length': 7852.171875, 'completions/min_length': 1276.0, 'completions/max_length': 15755.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7852.171875, 'completions/min_terminated_length': 1276.0, 'completions/max_terminated_length': 15755.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.19438527524471283, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022330068051815033, 'sampling/sampling_logp_difference/max': 10.076086044311523, 'sampling/importance_sampling_ratio/min': 4.2073770600836724e-05, 'sampling/importance_sampling_ratio/mean': 0.9999812841415405, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0598893761634827, 'clip_ratio/low_mean': 2.737523408313791e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6588904259151604e-06, 'clip_ratio/high_max': 6.635561703660642e-06, 'clip_ratio/region_mean': 2.9034124281679397e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 283/1024 [12:49:12<33:35:54, 163.23s/it][AINFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 284/1024 [12:52:20<35:05:31, 170.72s/it][A
+                                                         [A{'loss': 0.0587, 'grad_norm': 0.0027661293279379606, 'learning_rate': 1e-05, 'num_tokens': 245628064.0, 'completions/mean_length': 7972.2265625, 'completions/min_length': 610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7700.87890625, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.1872510462999344, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021125148981809616, 'sampling/sampling_logp_difference/max': 10.366576194763184, 'sampling/importance_sampling_ratio/min': 3.1466843211092055e-05, 'sampling/importance_sampling_ratio/mean': 0.9999428987503052, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.933217465877533, 'clip_ratio/low_mean': 4.7973388973332476e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.885042236921436e-07, 'clip_ratio/high_max': 3.1540168947685743e-06, 'clip_ratio/region_mean': 4.876189268543385e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 284/1024 [12:52:20<35:05:31, 170.72s/it][AINFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 285/1024 [12:55:06<34:44:04, 169.21s/it][A
+                                                         [A{'loss': 0.0786, 'grad_norm': 0.005680318456143141, 'learning_rate': 1e-05, 'num_tokens': 246561329.0, 'completions/mean_length': 7135.6953125, 'completions/min_length': 640.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6913.736328125, 'completions/min_terminated_length': 640.0, 'completions/max_terminated_length': 15744.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3077537715435028, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018504241481423378, 'sampling/sampling_logp_difference/max': 9.737424850463867, 'sampling/importance_sampling_ratio/min': 5.9032357967225835e-05, 'sampling/importance_sampling_ratio/mean': 0.9999462366104126, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7786942347884178, 'clip_ratio/low_mean': 4.6317693090713874e-05, 'clip_ratio/low_min': 3.820877282123547e-06, 'clip_ratio/high_mean': 3.241492265715351e-06, 'clip_ratio/high_max': 1.2965969062861404e-05, 'clip_ratio/region_mean': 4.955918507221213e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 285/1024 [12:55:06<34:44:04, 169.21s/it][AINFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 286/1024 [12:57:53<34:35:02, 168.70s/it][A
+                                                         [A{'loss': 0.1072, 'grad_norm': 0.0026402862276881933, 'learning_rate': 1e-05, 'num_tokens': 247437415.0, 'completions/mean_length': 6704.046875, 'completions/min_length': 155.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6627.82666015625, 'completions/min_terminated_length': 155.0, 'completions/max_terminated_length': 16161.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.31276631355285645, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02134273201227188, 'sampling/sampling_logp_difference/max': 7.156195640563965, 'sampling/importance_sampling_ratio/min': 0.0007800163584761322, 'sampling/importance_sampling_ratio/mean': 0.9998904466629028, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0435140281915665, 'clip_ratio/low_mean': 4.22437145175536e-05, 'clip_ratio/low_min': 1.4025082009538892e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.22437145175536e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 286/1024 [12:57:53<34:35:02, 168.70s/it][AINFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 287/1024 [13:00:28<33:41:57, 164.61s/it][A
+                                                         [A{'loss': 0.0565, 'grad_norm': 0.003993614576756954, 'learning_rate': 1e-05, 'num_tokens': 248211112.0, 'completions/mean_length': 5892.5078125, 'completions/min_length': 249.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5725.9765625, 'completions/min_terminated_length': 249.0, 'completions/max_terminated_length': 15708.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3322049677371979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01924925297498703, 'sampling/sampling_logp_difference/max': 6.005458354949951, 'sampling/importance_sampling_ratio/min': 0.0024652592837810516, 'sampling/importance_sampling_ratio/mean': 1.0000004768371582, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8004944771528244, 'clip_ratio/low_mean': 4.7084630978133646e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.8746438159905665e-06, 'clip_ratio/high_max': 2.223430897174694e-05, 'clip_ratio/region_mean': 5.3959275192028144e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 287/1024 [13:00:28<33:41:57, 164.61s/it][AINFO 12-02 02:25:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:25:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:25:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:25:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 288/1024 [13:03:43<35:31:01, 173.72s/it][A
+                                                         [A{'loss': 0.0823, 'grad_norm': 0.001573400106281042, 'learning_rate': 1e-05, 'num_tokens': 249228106.0, 'completions/mean_length': 7812.140625, 'completions/min_length': 1515.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7316.24755859375, 'completions/min_terminated_length': 1515.0, 'completions/max_terminated_length': 15892.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01956877112388611, 'sampling/sampling_logp_difference/max': 6.906228542327881, 'sampling/importance_sampling_ratio/min': 0.001001527882181108, 'sampling/importance_sampling_ratio/mean': 0.9998818635940552, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8841542899608612, 'clip_ratio/low_mean': 3.415995615796419e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.458270550207089e-06, 'clip_ratio/high_max': 2.1833082200828358e-05, 'clip_ratio/region_mean': 3.961822596920683e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 288/1024 [13:03:43<35:31:01, 173.72s/it][AINFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 289/1024 [13:06:33<35:13:31, 172.53s/it][A
+                                                         [A{'loss': 0.0438, 'grad_norm': 0.0021125099156051874, 'learning_rate': 1e-05, 'num_tokens': 250063284.0, 'completions/mean_length': 6372.953125, 'completions/min_length': 686.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6132.6884765625, 'completions/min_terminated_length': 686.0, 'completions/max_terminated_length': 16250.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01943521574139595, 'sampling/sampling_logp_difference/max': 9.937475204467773, 'sampling/importance_sampling_ratio/min': 4.8329173296224326e-05, 'sampling/importance_sampling_ratio/mean': 0.9999308586120605, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8228401988744736, 'clip_ratio/low_mean': 3.068193461785995e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.53609755418438e-06, 'clip_ratio/high_max': 1.014439021673752e-05, 'clip_ratio/region_mean': 3.321803217204433e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 289/1024 [13:06:33<35:13:31, 172.53s/it][AINFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 290/1024 [13:09:49<36:37:07, 179.60s/it][A
+                                                         [A{'loss': 0.0565, 'grad_norm': 0.0022315154783427715, 'learning_rate': 1e-05, 'num_tokens': 251085123.0, 'completions/mean_length': 7817.8671875, 'completions/min_length': 1568.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7396.58154296875, 'completions/min_terminated_length': 1568.0, 'completions/max_terminated_length': 16270.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021764669567346573, 'sampling/sampling_logp_difference/max': 12.760490417480469, 'sampling/importance_sampling_ratio/min': 2.8720330647047376e-06, 'sampling/importance_sampling_ratio/mean': 0.99993896484375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9454319775104523, 'clip_ratio/low_mean': 2.526416994896863e-05, 'clip_ratio/low_min': 6.7760895490209805e-06, 'clip_ratio/high_mean': 1.7559765410624095e-06, 'clip_ratio/high_max': 7.023906164249638e-06, 'clip_ratio/region_mean': 2.7020146660561295e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 290/1024 [13:09:49<36:37:07, 179.60s/it][AINFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 28%|██▊       | 291/1024 [13:12:50<36:38:48, 179.98s/it][A
+                                                         [A{'loss': 0.0808, 'grad_norm': 0.004663965664803982, 'learning_rate': 1e-05, 'num_tokens': 252020906.0, 'completions/mean_length': 7168.4921875, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6635.36328125, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 16352.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2927239239215851, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01929781585931778, 'sampling/sampling_logp_difference/max': 7.861782550811768, 'sampling/importance_sampling_ratio/min': 0.0003851866349577904, 'sampling/importance_sampling_ratio/mean': 0.9999589920043945, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8433891162276268, 'clip_ratio/low_mean': 4.36271948274225e-05, 'clip_ratio/low_min': 3.6957101201551268e-06, 'clip_ratio/high_mean': 3.699491571751423e-06, 'clip_ratio/high_max': 1.4797966287005693e-05, 'clip_ratio/region_mean': 4.732668639917392e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 291/1024 [13:12:50<36:38:48, 179.98s/it][AINFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▊       | 292/1024 [13:15:34<35:38:33, 175.29s/it][A
+                                                         [A{'loss': 0.079, 'grad_norm': 0.0036942458245903254, 'learning_rate': 1e-05, 'num_tokens': 252977435.0, 'completions/mean_length': 7322.5078125, 'completions/min_length': 1196.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6876.8603515625, 'completions/min_terminated_length': 1196.0, 'completions/max_terminated_length': 16301.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.24275577068328857, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0205365102738142, 'sampling/sampling_logp_difference/max': 8.124969482421875, 'sampling/importance_sampling_ratio/min': 0.00029605376766994596, 'sampling/importance_sampling_ratio/mean': 0.9999804496765137, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9157031401991844, 'clip_ratio/low_mean': 4.2792244585143635e-05, 'clip_ratio/low_min': 1.0337215371691855e-05, 'clip_ratio/high_mean': 6.089093403716106e-06, 'clip_ratio/high_max': 1.996871560550062e-05, 'clip_ratio/region_mean': 4.8881338216233416e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 292/1024 [13:15:34<35:38:33, 175.29s/it][AINFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▊       | 293/1024 [13:18:20<35:01:00, 172.45s/it][A
+                                                         [A{'loss': 0.0227, 'grad_norm': 0.0034127074759453535, 'learning_rate': 1e-05, 'num_tokens': 253896161.0, 'completions/mean_length': 7025.484375, 'completions/min_length': 337.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6723.5966796875, 'completions/min_terminated_length': 337.0, 'completions/max_terminated_length': 16078.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.27722424268722534, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.023741140961647034, 'sampling/sampling_logp_difference/max': 7.562129497528076, 'sampling/importance_sampling_ratio/min': 0.0005197672289796174, 'sampling/importance_sampling_ratio/mean': 0.9999400973320007, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1329731941223145, 'clip_ratio/low_mean': 2.631919460327481e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.631919460327481e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 293/1024 [13:18:20<35:01:00, 172.45s/it][AINFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▊       | 294/1024 [13:21:07<34:36:46, 170.69s/it][A
+                                                         [A{'loss': 0.0156, 'grad_norm': 0.003959407564252615, 'learning_rate': 1e-05, 'num_tokens': 254690264.0, 'completions/mean_length': 5996.1796875, 'completions/min_length': 882.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5661.08837890625, 'completions/min_terminated_length': 882.0, 'completions/max_terminated_length': 13776.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.26645541191101074, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018407585099339485, 'sampling/sampling_logp_difference/max': 15.73043155670166, 'sampling/importance_sampling_ratio/min': 1.4735347519945208e-07, 'sampling/importance_sampling_ratio/mean': 0.9999563694000244, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8773328885436058, 'clip_ratio/low_mean': 2.4661783299961826e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.092160914595297e-06, 'clip_ratio/high_max': 4.368643658381188e-06, 'clip_ratio/region_mean': 2.5753944555617636e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 294/1024 [13:21:07<34:36:46, 170.69s/it][AINFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 295/1024 [13:23:49<34:02:37, 168.12s/it][A
+                                                         [A{'loss': 0.0541, 'grad_norm': 0.0030910037457942963, 'learning_rate': 1e-05, 'num_tokens': 255626394.0, 'completions/mean_length': 7165.328125, 'completions/min_length': 1115.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6867.951171875, 'completions/min_terminated_length': 1115.0, 'completions/max_terminated_length': 16383.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020668907091021538, 'sampling/sampling_logp_difference/max': 8.407832145690918, 'sampling/importance_sampling_ratio/min': 0.00022311302018351853, 'sampling/importance_sampling_ratio/mean': 1.0000731945037842, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9502597972750664, 'clip_ratio/low_mean': 3.736187466074625e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.041209194838302e-06, 'clip_ratio/high_max': 1.616483677935321e-05, 'clip_ratio/region_mean': 4.140308453770558e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 295/1024 [13:23:49<34:02:37, 168.12s/it][AINFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 296/1024 [13:26:45<34:29:10, 170.54s/it][A
+                                                         [A{'loss': 0.0463, 'grad_norm': 0.0037233952898532152, 'learning_rate': 1e-05, 'num_tokens': 256673457.0, 'completions/mean_length': 8001.9296875, 'completions/min_length': 164.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7661.34912109375, 'completions/min_terminated_length': 164.0, 'completions/max_terminated_length': 15375.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020540472120046616, 'sampling/sampling_logp_difference/max': 6.124904632568359, 'sampling/importance_sampling_ratio/min': 0.0021876997780054808, 'sampling/importance_sampling_ratio/mean': 0.9999151229858398, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8591345250606537, 'clip_ratio/low_mean': 5.5247357522603124e-05, 'clip_ratio/low_min': 3.6811261452385224e-06, 'clip_ratio/high_mean': 2.9256716516101733e-06, 'clip_ratio/high_max': 1.1702686606440693e-05, 'clip_ratio/region_mean': 5.8173028264718596e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 296/1024 [13:26:45<34:29:10, 170.54s/it][AINFO 12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 297/1024 [13:29:27<33:55:33, 168.00s/it][A
+                                                         [A{'loss': 0.0669, 'grad_norm': 0.006054217461496592, 'learning_rate': 1e-05, 'num_tokens': 257578501.0, 'completions/mean_length': 6924.84375, 'completions/min_length': 803.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6697.82421875, 'completions/min_terminated_length': 803.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2927239239215851, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019306108355522156, 'sampling/sampling_logp_difference/max': 4.842195510864258, 'sampling/importance_sampling_ratio/min': 0.007889713160693645, 'sampling/importance_sampling_ratio/mean': 0.9999213218688965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7969356626272202, 'clip_ratio/low_mean': 3.570647322703735e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2759249216287571e-05, 'clip_ratio/high_max': 3.721341136042611e-05, 'clip_ratio/region_mean': 4.846572301175911e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 297/1024 [13:29:27<33:55:33, 168.00s/it][AINFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 298/1024 [13:31:59<32:55:41, 163.28s/it][A
+                                                         [A{'loss': 0.0689, 'grad_norm': 0.004903806839138269, 'learning_rate': 1e-05, 'num_tokens': 258392625.0, 'completions/mean_length': 6203.03125, 'completions/min_length': 180.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5958.6884765625, 'completions/min_terminated_length': 180.0, 'completions/max_terminated_length': 14439.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01909301057457924, 'sampling/sampling_logp_difference/max': 8.498823165893555, 'sampling/importance_sampling_ratio/min': 0.00020370795391499996, 'sampling/importance_sampling_ratio/mean': 0.9999826550483704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8734413683414459, 'clip_ratio/low_mean': 5.2388056587915344e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5528859737278253e-06, 'clip_ratio/high_max': 1.0211543894911301e-05, 'clip_ratio/region_mean': 5.4940942732173426e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 298/1024 [13:31:59<32:55:41, 163.28s/it][AINFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 299/1024 [13:34:57<33:44:39, 167.56s/it][A
+                                                         [A{'loss': 0.0625, 'grad_norm': 0.0033637424930930138, 'learning_rate': 1e-05, 'num_tokens': 259435270.0, 'completions/mean_length': 7982.5390625, 'completions/min_length': 776.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7641.01611328125, 'completions/min_terminated_length': 776.0, 'completions/max_terminated_length': 15554.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.31246691942214966, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02182736061513424, 'sampling/sampling_logp_difference/max': 6.406092166900635, 'sampling/importance_sampling_ratio/min': 0.0016514655435457826, 'sampling/importance_sampling_ratio/mean': 0.9999765753746033, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0091779381036758, 'clip_ratio/low_mean': 4.373456977191381e-05, 'clip_ratio/low_min': 3.670856358439778e-06, 'clip_ratio/high_mean': 4.64845766146027e-06, 'clip_ratio/high_max': 1.5135058674786706e-05, 'clip_ratio/region_mean': 4.8383026296505705e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 299/1024 [13:34:57<33:44:39, 167.56s/it][AINFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 300/1024 [13:37:41<33:29:23, 166.52s/it][A
+                                                         [A{'loss': 0.144, 'grad_norm': 0.0052203768864274025, 'learning_rate': 1e-05, 'num_tokens': 260337614.0, 'completions/mean_length': 6915.3125, 'completions/min_length': 778.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6688.064453125, 'completions/min_terminated_length': 778.0, 'completions/max_terminated_length': 16265.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.37928223609924316, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017896221950650215, 'sampling/sampling_logp_difference/max': 9.562335968017578, 'sampling/importance_sampling_ratio/min': 7.032832218101248e-05, 'sampling/importance_sampling_ratio/mean': 0.9999016523361206, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7964543774724007, 'clip_ratio/low_mean': 5.2442986770984135e-05, 'clip_ratio/low_min': 8.75736759553547e-06, 'clip_ratio/high_mean': 5.991175669350923e-06, 'clip_ratio/high_max': 2.3964702677403693e-05, 'clip_ratio/region_mean': 5.843416238349164e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 300/1024 [13:37:41<33:29:23, 166.52s/it][AINFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 301/1024 [13:40:24<33:14:21, 165.51s/it][A
+                                                         [A{'loss': 0.0984, 'grad_norm': 0.005570738110691309, 'learning_rate': 1e-05, 'num_tokens': 261254070.0, 'completions/mean_length': 7029.4375, 'completions/min_length': 679.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6880.95263671875, 'completions/min_terminated_length': 679.0, 'completions/max_terminated_length': 16198.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3327290117740631, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01905740052461624, 'sampling/sampling_logp_difference/max': 7.005340576171875, 'sampling/importance_sampling_ratio/min': 0.0009070249507203698, 'sampling/importance_sampling_ratio/mean': 0.9999494552612305, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8605096861720085, 'clip_ratio/low_mean': 6.243192206056847e-05, 'clip_ratio/low_min': 1.2397775662975619e-05, 'clip_ratio/high_mean': 1.1145679081892013e-05, 'clip_ratio/high_max': 4.458271632756805e-05, 'clip_ratio/region_mean': 7.357759886872373e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 301/1024 [13:40:24<33:14:21, 165.51s/it][AINFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 29%|██▉       | 302/1024 [13:43:06<32:59:33, 164.51s/it][A
+                                                         [A{'loss': 0.062, 'grad_norm': 0.004496110137552023, 'learning_rate': 1e-05, 'num_tokens': 262024906.0, 'completions/mean_length': 5858.84375, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5606.240234375, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 15987.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019225869327783585, 'sampling/sampling_logp_difference/max': 7.812377452850342, 'sampling/importance_sampling_ratio/min': 0.00040469475788995624, 'sampling/importance_sampling_ratio/mean': 0.9999294877052307, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8430554121732712, 'clip_ratio/low_mean': 7.46641262594494e-05, 'clip_ratio/low_min': 5.041745680500753e-06, 'clip_ratio/high_mean': 1.1191766247975465e-05, 'clip_ratio/high_max': 3.390461233720998e-05, 'clip_ratio/region_mean': 8.585589102949598e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 302/1024 [13:43:06<32:59:33, 164.51s/it][AINFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 303/1024 [13:46:10<34:07:24, 170.38s/it][A
+                                                         [A{'loss': -0.0002, 'grad_norm': 0.00348713924176991, 'learning_rate': 1e-05, 'num_tokens': 263110844.0, 'completions/mean_length': 8337.328125, 'completions/min_length': 837.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7728.7568359375, 'completions/min_terminated_length': 837.0, 'completions/max_terminated_length': 15976.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.20805485546588898, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02157524600625038, 'sampling/sampling_logp_difference/max': 6.090071678161621, 'sampling/importance_sampling_ratio/min': 0.0022652465850114822, 'sampling/importance_sampling_ratio/mean': 0.9998900890350342, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.901745393872261, 'clip_ratio/low_mean': 3.7080020149460324e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.140988825289242e-07, 'clip_ratio/high_max': 3.2563955301156966e-06, 'clip_ratio/region_mean': 3.789411886145899e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 303/1024 [13:46:10<34:07:24, 170.38s/it][AINFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 304/1024 [13:49:10<34:36:46, 173.06s/it][A
+                                                         [A{'loss': 0.0551, 'grad_norm': 0.003980033565312624, 'learning_rate': 1e-05, 'num_tokens': 264036169.0, 'completions/mean_length': 7084.7265625, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6381.42041015625, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.27434611320495605, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018782664090394974, 'sampling/sampling_logp_difference/max': 8.999666213989258, 'sampling/importance_sampling_ratio/min': 0.00012345099821686745, 'sampling/importance_sampling_ratio/mean': 0.9999673366546631, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8265534415841103, 'clip_ratio/low_mean': 2.823553325015382e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.934936211815511e-06, 'clip_ratio/high_max': 2.3739744847262045e-05, 'clip_ratio/region_mean': 3.417046866616147e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 304/1024 [13:49:10<34:36:46, 173.06s/it][AINFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 305/1024 [13:51:51<33:52:34, 169.62s/it][A
+                                                         [A{'loss': 0.1139, 'grad_norm': 0.006467343773692846, 'learning_rate': 1e-05, 'num_tokens': 264892767.0, 'completions/mean_length': 6543.796875, 'completions/min_length': 93.0, 'completions/max_length': 16292.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6543.796875, 'completions/min_terminated_length': 93.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3934885561466217, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.02032080665230751, 'sampling/sampling_logp_difference/max': 9.221251487731934, 'sampling/importance_sampling_ratio/min': 9.891482477542013e-05, 'sampling/importance_sampling_ratio/mean': 1.0000489950180054, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8899869695305824, 'clip_ratio/low_mean': 6.913120819262986e-05, 'clip_ratio/low_min': 2.494283216947224e-05, 'clip_ratio/high_mean': 3.771558226617344e-06, 'clip_ratio/high_max': 1.1745505617000163e-05, 'clip_ratio/region_mean': 7.290276607818669e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 305/1024 [13:51:51<33:52:34, 169.62s/it][AINFO 12-02 03:16:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:16:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:16:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:16:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 306/1024 [13:54:58<34:50:44, 174.71s/it][A
+                                                         [A{'loss': 0.0948, 'grad_norm': 0.003174177836626768, 'learning_rate': 1e-05, 'num_tokens': 265995697.0, 'completions/mean_length': 8483.390625, 'completions/min_length': 1342.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7813.84765625, 'completions/min_terminated_length': 1342.0, 'completions/max_terminated_length': 16307.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02145479805767536, 'sampling/sampling_logp_difference/max': 7.4824934005737305, 'sampling/importance_sampling_ratio/min': 0.0005628522485494614, 'sampling/importance_sampling_ratio/mean': 1.0000269412994385, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9621479511260986, 'clip_ratio/low_mean': 4.395576979732141e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.395576979732141e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 306/1024 [13:54:58<34:50:44, 174.71s/it][AINFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|██▉       | 307/1024 [13:57:43<34:13:09, 171.81s/it][A
+                                                         [A{'loss': 0.0887, 'grad_norm': 0.003356153378263116, 'learning_rate': 1e-05, 'num_tokens': 266937707.0, 'completions/mean_length': 7184.578125, 'completions/min_length': 419.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6963.79248046875, 'completions/min_terminated_length': 419.0, 'completions/max_terminated_length': 14985.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02204768732190132, 'sampling/sampling_logp_difference/max': 6.374974727630615, 'sampling/importance_sampling_ratio/min': 0.0017036627978086472, 'sampling/importance_sampling_ratio/mean': 1.0000238418579102, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9993807673454285, 'clip_ratio/low_mean': 3.7911659774181317e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.149131202917488e-06, 'clip_ratio/high_max': 1.2596524811669951e-05, 'clip_ratio/region_mean': 4.106079018129094e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 307/1024 [13:57:43<34:13:09, 171.81s/it][AINFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 308/1024 [14:00:35<34:11:36, 171.92s/it][A
+                                                         [A{'loss': 0.0519, 'grad_norm': 0.006375293247401714, 'learning_rate': 1e-05, 'num_tokens': 267853880.0, 'completions/mean_length': 7029.2265625, 'completions/min_length': 851.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6727.45947265625, 'completions/min_terminated_length': 851.0, 'completions/max_terminated_length': 16216.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.27328038215637207, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020365029573440552, 'sampling/sampling_logp_difference/max': 4.542207717895508, 'sampling/importance_sampling_ratio/min': 0.010649868287146091, 'sampling/importance_sampling_ratio/mean': 1.000023365020752, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9139953926205635, 'clip_ratio/low_mean': 4.8845648166206956e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.811290921225009e-06, 'clip_ratio/high_max': 1.9245163684900035e-05, 'clip_ratio/region_mean': 5.365693925796222e-05, 'epoch': 0.28}
+
+ 30%|███       | 308/1024 [14:00:35<34:11:36, 171.92s/it][AINFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 309/1024 [14:03:09<33:05:43, 166.63s/it][A
+                                                         [A{'loss': 0.0733, 'grad_norm': 0.003697809297591448, 'learning_rate': 1e-05, 'num_tokens': 268665721.0, 'completions/mean_length': 6188.0078125, 'completions/min_length': 612.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5943.30419921875, 'completions/min_terminated_length': 612.0, 'completions/max_terminated_length': 16106.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.20699402689933777, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.016581017524003983, 'sampling/sampling_logp_difference/max': 3.531106472015381, 'sampling/importance_sampling_ratio/min': 0.02927250787615776, 'sampling/importance_sampling_ratio/mean': 0.9999372363090515, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7640773430466652, 'clip_ratio/low_mean': 2.5999243803198624e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2031262031086953e-06, 'clip_ratio/high_max': 4.812504812434781e-06, 'clip_ratio/region_mean': 2.720237000630732e-05, 'epoch': 0.28}
+
+ 30%|███       | 309/1024 [14:03:09<33:05:43, 166.63s/it][AINFO 12-02 03:28:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:28:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:28:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:28:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 310/1024 [14:06:11<33:57:39, 171.23s/it][A
+                                                         [A{'loss': 0.0763, 'grad_norm': 0.002286596456542611, 'learning_rate': 1e-05, 'num_tokens': 269726181.0, 'completions/mean_length': 8128.21875, 'completions/min_length': 1227.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7861.90283203125, 'completions/min_terminated_length': 1227.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019403984770178795, 'sampling/sampling_logp_difference/max': 12.90043830871582, 'sampling/importance_sampling_ratio/min': 2.4969556307041785e-06, 'sampling/importance_sampling_ratio/mean': 0.9999798536300659, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8218234181404114, 'clip_ratio/low_mean': 2.1358927824621787e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.1358927824621787e-05, 'epoch': 0.29}
+
+ 30%|███       | 310/1024 [14:06:11<33:57:39, 171.23s/it][AINFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 311/1024 [14:08:33<32:08:31, 162.29s/it][A
+                                                         [A{'loss': 0.0193, 'grad_norm': 0.00485506234690547, 'learning_rate': 1e-05, 'num_tokens': 270470616.0, 'completions/mean_length': 5673.3359375, 'completions/min_length': 306.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5503.32568359375, 'completions/min_terminated_length': 306.0, 'completions/max_terminated_length': 16256.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01881871558725834, 'sampling/sampling_logp_difference/max': 6.999490737915039, 'sampling/importance_sampling_ratio/min': 0.0009123464697040617, 'sampling/importance_sampling_ratio/mean': 1.0000226497650146, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9275510385632515, 'clip_ratio/low_mean': 3.0363167581981543e-05, 'clip_ratio/low_min': 6.364238288369961e-06, 'clip_ratio/high_mean': 3.7021193293185206e-06, 'clip_ratio/high_max': 1.4808477317274082e-05, 'clip_ratio/region_mean': 3.4065286854456645e-05, 'epoch': 0.29}
+
+ 30%|███       | 311/1024 [14:08:33<32:08:31, 162.29s/it][AINFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 30%|███       | 312/1024 [14:11:32<33:04:54, 167.27s/it][A
+                                                         [A{'loss': 0.032, 'grad_norm': 0.005874342750757933, 'learning_rate': 1e-05, 'num_tokens': 271377723.0, 'completions/mean_length': 6944.8984375, 'completions/min_length': 896.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6795.07177734375, 'completions/min_terminated_length': 896.0, 'completions/max_terminated_length': 16382.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020590776577591896, 'sampling/sampling_logp_difference/max': 10.049861907958984, 'sampling/importance_sampling_ratio/min': 4.3191710574319586e-05, 'sampling/importance_sampling_ratio/mean': 1.0000594854354858, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9335741624236107, 'clip_ratio/low_mean': 3.968570712231667e-05, 'clip_ratio/low_min': 3.4213767321489286e-06, 'clip_ratio/high_mean': 3.6739949109687586e-06, 'clip_ratio/high_max': 1.1274602456978755e-05, 'clip_ratio/region_mean': 4.335970191959859e-05, 'epoch': 0.29}
+
+ 30%|███       | 312/1024 [14:11:32<33:04:54, 167.27s/it][AINFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 313/1024 [14:14:25<33:22:48, 169.01s/it][A
+                                                         [A{'loss': 0.06, 'grad_norm': 0.001684082904830575, 'learning_rate': 1e-05, 'num_tokens': 272384891.0, 'completions/mean_length': 7705.625, 'completions/min_length': 329.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7278.8193359375, 'completions/min_terminated_length': 329.0, 'completions/max_terminated_length': 15806.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2648528814315796, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020136822015047073, 'sampling/sampling_logp_difference/max': 9.624967575073242, 'sampling/importance_sampling_ratio/min': 6.605865200981498e-05, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8491624072194099, 'clip_ratio/low_mean': 3.206376845810155e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.16031673719408e-06, 'clip_ratio/high_max': 1.264126694877632e-05, 'clip_ratio/region_mean': 3.522408474054828e-05, 'epoch': 0.29}
+
+ 31%|███       | 313/1024 [14:14:25<33:22:48, 169.01s/it][AINFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 314/1024 [14:17:10<33:05:18, 167.77s/it][A
+                                                         [A{'loss': 0.1233, 'grad_norm': 0.003692191792652011, 'learning_rate': 1e-05, 'num_tokens': 273251630.0, 'completions/mean_length': 6611.1484375, 'completions/min_length': 1116.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6534.19677734375, 'completions/min_terminated_length': 1116.0, 'completions/max_terminated_length': 15923.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.27564430236816406, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019237037748098373, 'sampling/sampling_logp_difference/max': 5.774331569671631, 'sampling/importance_sampling_ratio/min': 0.0031062732450664043, 'sampling/importance_sampling_ratio/mean': 0.9999606609344482, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8867302760481834, 'clip_ratio/low_mean': 3.8573590472879005e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.443089442749624e-06, 'clip_ratio/high_max': 9.772357770998497e-06, 'clip_ratio/region_mean': 4.101667946088128e-05, 'epoch': 0.29}
+
+ 31%|███       | 314/1024 [14:17:10<33:05:18, 167.77s/it][AINFO 12-02 03:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:42:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 315/1024 [14:19:49<32:32:32, 165.24s/it][A
+                                                         [A{'loss': -0.0072, 'grad_norm': 0.004167635925114155, 'learning_rate': 1e-05, 'num_tokens': 274146482.0, 'completions/mean_length': 6770.46875, 'completions/min_length': 957.0, 'completions/max_length': 15786.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6770.46875, 'completions/min_terminated_length': 957.0, 'completions/max_terminated_length': 15786.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.23486016690731049, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019684650003910065, 'sampling/sampling_logp_difference/max': 9.18593978881836, 'sampling/importance_sampling_ratio/min': 0.00010247006866848096, 'sampling/importance_sampling_ratio/mean': 1.000013828277588, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8252957463264465, 'clip_ratio/low_mean': 1.7575501146893657e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.664363972206047e-06, 'clip_ratio/high_max': 3.0103737344688852e-05, 'clip_ratio/region_mean': 2.723986426644842e-05, 'epoch': 0.29}
+
+ 31%|███       | 315/1024 [14:19:49<32:32:32, 165.24s/it][AINFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 316/1024 [14:22:58<33:52:58, 172.29s/it][A
+                                                         [A{'loss': 0.0531, 'grad_norm': 0.0030363225378096104, 'learning_rate': 1e-05, 'num_tokens': 275214040.0, 'completions/mean_length': 8210.859375, 'completions/min_length': 891.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7365.36181640625, 'completions/min_terminated_length': 891.0, 'completions/max_terminated_length': 15827.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019275270402431488, 'sampling/sampling_logp_difference/max': 5.858705997467041, 'sampling/importance_sampling_ratio/min': 0.002854935359209776, 'sampling/importance_sampling_ratio/mean': 0.9998943209648132, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8118235394358635, 'clip_ratio/low_mean': 3.877351048231503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6323651834682096e-06, 'clip_ratio/high_max': 6.529460733872838e-06, 'clip_ratio/region_mean': 4.040587566578324e-05, 'epoch': 0.29}
+
+ 31%|███       | 316/1024 [14:22:58<33:52:58, 172.29s/it][AINFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 317/1024 [14:26:01<34:27:13, 175.44s/it][A
+                                                         [A{'loss': 0.0404, 'grad_norm': 0.004777858033776283, 'learning_rate': 1e-05, 'num_tokens': 276138049.0, 'completions/mean_length': 7072.8828125, 'completions/min_length': 374.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6849.41650390625, 'completions/min_terminated_length': 374.0, 'completions/max_terminated_length': 14900.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01849908009171486, 'sampling/sampling_logp_difference/max': 5.860339164733887, 'sampling/importance_sampling_ratio/min': 0.0028502768836915493, 'sampling/importance_sampling_ratio/mean': 0.9999368190765381, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8018335327506065, 'clip_ratio/low_mean': 2.3981688286767167e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7700157286526519e-06, 'clip_ratio/high_max': 7.0800629146106075e-06, 'clip_ratio/region_mean': 2.5751703674359305e-05, 'epoch': 0.29}
+
+ 31%|███       | 317/1024 [14:26:01<34:27:13, 175.44s/it][AINFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 318/1024 [14:28:37<33:16:43, 169.69s/it][A
+                                                         [A{'loss': 0.1331, 'grad_norm': 0.0030593445990234613, 'learning_rate': 1e-05, 'num_tokens': 276910124.0, 'completions/mean_length': 5889.8359375, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5723.26220703125, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 14447.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3621976971626282, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01834402233362198, 'sampling/sampling_logp_difference/max': 8.874987602233887, 'sampling/importance_sampling_ratio/min': 0.000139843366923742, 'sampling/importance_sampling_ratio/mean': 0.9999091029167175, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7976400703191757, 'clip_ratio/low_mean': 4.28424866640853e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.648421506521117e-06, 'clip_ratio/high_max': 2.259368602608447e-05, 'clip_ratio/region_mean': 4.849090737479855e-05, 'epoch': 0.29}
+
+ 31%|███       | 318/1024 [14:28:37<33:16:43, 169.69s/it][AINFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███       | 319/1024 [14:31:38<33:54:05, 173.11s/it][A
+                                                         [A{'loss': 0.077, 'grad_norm': 0.004245694726705551, 'learning_rate': 1e-05, 'num_tokens': 277843542.0, 'completions/mean_length': 7144.265625, 'completions/min_length': 1200.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6689.85205078125, 'completions/min_terminated_length': 1200.0, 'completions/max_terminated_length': 16324.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.24541422724723816, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01875344291329384, 'sampling/sampling_logp_difference/max': 11.499897956848145, 'sampling/importance_sampling_ratio/min': 1.0131127055501565e-05, 'sampling/importance_sampling_ratio/mean': 0.9998534321784973, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8309404999017715, 'clip_ratio/low_mean': 2.377464920755301e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.663561756184208e-06, 'clip_ratio/high_max': 1.4654247024736833e-05, 'clip_ratio/region_mean': 2.7438210736363544e-05, 'epoch': 0.29}
+
+ 31%|███       | 319/1024 [14:31:38<33:54:05, 173.11s/it][AINFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███▏      | 320/1024 [14:34:24<33:27:11, 171.07s/it][A
+                                                         [A{'loss': 0.0723, 'grad_norm': 0.0035574575886130333, 'learning_rate': 1e-05, 'num_tokens': 278730129.0, 'completions/mean_length': 6779.5234375, 'completions/min_length': 767.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6703.8974609375, 'completions/min_terminated_length': 767.0, 'completions/max_terminated_length': 15722.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.32825323939323425, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02137477695941925, 'sampling/sampling_logp_difference/max': 5.151239395141602, 'sampling/importance_sampling_ratio/min': 0.005792221520096064, 'sampling/importance_sampling_ratio/mean': 0.9999299645423889, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9584890529513359, 'clip_ratio/low_mean': 4.735765514851664e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.241558604509919e-06, 'clip_ratio/high_max': 6.252500952541595e-06, 'clip_ratio/region_mean': 4.9599213525652885e-05, 'epoch': 0.29}
+
+ 31%|███▏      | 320/1024 [14:34:24<33:27:11, 171.07s/it][AINFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███▏      | 321/1024 [14:36:54<32:11:12, 164.83s/it][A
+                                                         [A{'loss': 0.0331, 'grad_norm': 0.0037982752546668053, 'learning_rate': 1e-05, 'num_tokens': 279462542.0, 'completions/mean_length': 5582.9765625, 'completions/min_length': 781.0, 'completions/max_length': 15892.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5582.9765625, 'completions/min_terminated_length': 781.0, 'completions/max_terminated_length': 15892.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3164186477661133, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01906203106045723, 'sampling/sampling_logp_difference/max': 6.124997138977051, 'sampling/importance_sampling_ratio/min': 0.0021874974481761456, 'sampling/importance_sampling_ratio/mean': 0.9999780058860779, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8629376217722893, 'clip_ratio/low_mean': 2.195712454522436e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.518853403278627e-06, 'clip_ratio/high_max': 3.2948471016425174e-05, 'clip_ratio/region_mean': 3.14759782895635e-05, 'epoch': 0.3}
+
+ 31%|███▏      | 321/1024 [14:36:54<32:11:12, 164.83s/it][AINFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 31%|███▏      | 322/1024 [14:39:55<33:02:52, 169.48s/it][A
+                                                         [A{'loss': 0.0585, 'grad_norm': 0.0027678858023136854, 'learning_rate': 1e-05, 'num_tokens': 280370207.0, 'completions/mean_length': 6942.2578125, 'completions/min_length': 1156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6477.90966796875, 'completions/min_terminated_length': 1156.0, 'completions/max_terminated_length': 16204.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3066929280757904, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01940828748047352, 'sampling/sampling_logp_difference/max': 8.3748779296875, 'sampling/importance_sampling_ratio/min': 0.00023058800434228033, 'sampling/importance_sampling_ratio/mean': 0.9998471736907959, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8147861957550049, 'clip_ratio/low_mean': 5.367386921761863e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.757368406491878e-06, 'clip_ratio/high_max': 1.1029473625967512e-05, 'clip_ratio/region_mean': 5.6431237737797346e-05, 'epoch': 0.3}
+
+ 31%|███▏      | 322/1024 [14:39:55<33:02:52, 169.48s/it][AINFO 12-02 04:04:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:04:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:04:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:04:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 323/1024 [14:42:30<32:10:12, 165.21s/it][A
+                                                         [A{'loss': 0.0839, 'grad_norm': 0.00577945914119482, 'learning_rate': 1e-05, 'num_tokens': 281189491.0, 'completions/mean_length': 6242.53125, 'completions/min_length': 1220.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5915.38671875, 'completions/min_terminated_length': 1220.0, 'completions/max_terminated_length': 15782.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01948760263621807, 'sampling/sampling_logp_difference/max': 9.2499418258667, 'sampling/importance_sampling_ratio/min': 9.611724817659706e-05, 'sampling/importance_sampling_ratio/mean': 0.9999679327011108, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.878915011882782, 'clip_ratio/low_mean': 3.232976985145797e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.765707434577052e-06, 'clip_ratio/high_max': 2.6367894406575942e-05, 'clip_ratio/region_mean': 4.109547796815605e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 323/1024 [14:42:30<32:10:12, 165.21s/it][AINFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 324/1024 [14:45:38<33:25:44, 171.92s/it][A
+                                                         [A{'loss': 0.0648, 'grad_norm': 0.0014128695474937558, 'learning_rate': 1e-05, 'num_tokens': 282103997.0, 'completions/mean_length': 7004.015625, 'completions/min_length': 224.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6622.71533203125, 'completions/min_terminated_length': 224.0, 'completions/max_terminated_length': 16310.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019019678235054016, 'sampling/sampling_logp_difference/max': 6.011474609375, 'sampling/importance_sampling_ratio/min': 0.0024504722096025944, 'sampling/importance_sampling_ratio/mean': 0.9999747276306152, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7964659407734871, 'clip_ratio/low_mean': 1.833109013205103e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1664920634757436e-05, 'clip_ratio/high_max': 3.50839609382092e-05, 'clip_ratio/region_mean': 2.9996010880495305e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 324/1024 [14:45:38<33:25:44, 171.92s/it][AINFO 12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 325/1024 [14:48:25<33:06:48, 170.54s/it][A
+                                                         [A{'loss': 0.0515, 'grad_norm': 0.002476039342582226, 'learning_rate': 1e-05, 'num_tokens': 283122382.0, 'completions/mean_length': 7822.6953125, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7546.52392578125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 15318.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020557202398777008, 'sampling/sampling_logp_difference/max': 6.930576324462891, 'sampling/importance_sampling_ratio/min': 0.0009774373611435294, 'sampling/importance_sampling_ratio/mean': 0.9999314546585083, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8571138679981232, 'clip_ratio/low_mean': 5.309064226821647e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.580651363994548e-06, 'clip_ratio/high_max': 1.832260545597819e-05, 'clip_ratio/region_mean': 5.767129368905444e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 325/1024 [14:48:25<33:06:48, 170.54s/it][AINFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 326/1024 [14:51:27<33:44:26, 174.02s/it][A
+                                                         [A{'loss': 0.043, 'grad_norm': 0.005309853237122297, 'learning_rate': 1e-05, 'num_tokens': 284130081.0, 'completions/mean_length': 7738.8984375, 'completions/min_length': 897.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 6844.57763671875, 'completions/min_terminated_length': 897.0, 'completions/max_terminated_length': 16319.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01873316988348961, 'sampling/sampling_logp_difference/max': 8.933455467224121, 'sampling/importance_sampling_ratio/min': 0.0001319014554610476, 'sampling/importance_sampling_ratio/mean': 0.9998971223831177, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7839021533727646, 'clip_ratio/low_mean': 4.19679121819172e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4346049965752172e-06, 'clip_ratio/high_max': 5.738419986300869e-06, 'clip_ratio/region_mean': 4.3402517292179255e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 326/1024 [14:51:27<33:44:26, 174.02s/it][AINFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 327/1024 [14:54:13<33:11:44, 171.46s/it][A
+                                                         [A{'loss': 0.0574, 'grad_norm': 0.004228116944432259, 'learning_rate': 1e-05, 'num_tokens': 285058720.0, 'completions/mean_length': 7102.2421875, 'completions/min_length': 529.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6954.9130859375, 'completions/min_terminated_length': 529.0, 'completions/max_terminated_length': 15952.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019325006753206253, 'sampling/sampling_logp_difference/max': 8.951294898986816, 'sampling/importance_sampling_ratio/min': 0.00012956927821505815, 'sampling/importance_sampling_ratio/mean': 0.9999712705612183, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8530801385641098, 'clip_ratio/low_mean': 4.043528815600439e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5177145062116324e-06, 'clip_ratio/high_max': 1.007085802484653e-05, 'clip_ratio/region_mean': 4.295300277590286e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 327/1024 [14:54:13<33:11:44, 171.46s/it][AINFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 328/1024 [14:56:54<32:32:24, 168.31s/it][A
+                                                         [A{'loss': 0.0374, 'grad_norm': 0.004967439454048872, 'learning_rate': 1e-05, 'num_tokens': 285919765.0, 'completions/mean_length': 6583.4765625, 'completions/min_length': 718.0, 'completions/max_length': 15594.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6583.4765625, 'completions/min_terminated_length': 718.0, 'completions/max_terminated_length': 15594.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021393200382590294, 'sampling/sampling_logp_difference/max': 4.093823432922363, 'sampling/importance_sampling_ratio/min': 0.016675354912877083, 'sampling/importance_sampling_ratio/mean': 1.00004243850708, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.021921381354332, 'clip_ratio/low_mean': 3.661125703047219e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0171863777941326e-06, 'clip_ratio/high_max': 4.06874551117653e-06, 'clip_ratio/region_mean': 3.762844340826632e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 328/1024 [14:56:54<32:32:24, 168.31s/it][AINFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 329/1024 [15:00:04<33:46:42, 174.97s/it][A
+                                                         [A{'loss': 0.0805, 'grad_norm': 0.004189736675471067, 'learning_rate': 1e-05, 'num_tokens': 286935512.0, 'completions/mean_length': 7770.5859375, 'completions/min_length': 1040.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7346.97509765625, 'completions/min_terminated_length': 1040.0, 'completions/max_terminated_length': 16299.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2369818240404129, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021805983036756516, 'sampling/sampling_logp_difference/max': 4.449572563171387, 'sampling/importance_sampling_ratio/min': 0.011683559976518154, 'sampling/importance_sampling_ratio/mean': 0.9999797344207764, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0466903448104858, 'clip_ratio/low_mean': 4.05305947879242e-05, 'clip_ratio/low_min': 4.215567059873138e-06, 'clip_ratio/high_mean': 3.053812861253391e-06, 'clip_ratio/high_max': 1.2215251445013564e-05, 'clip_ratio/region_mean': 4.358440742180392e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 329/1024 [15:00:04<33:46:42, 174.97s/it][AINFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 330/1024 [15:02:30<32:04:21, 166.37s/it][A
+                                                         [A{'loss': 0.0635, 'grad_norm': 0.0032866497058421373, 'learning_rate': 1e-05, 'num_tokens': 287681943.0, 'completions/mean_length': 5689.2421875, 'completions/min_length': 1194.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5432.568359375, 'completions/min_terminated_length': 1194.0, 'completions/max_terminated_length': 15758.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.640625, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01789461076259613, 'sampling/sampling_logp_difference/max': 7.873311519622803, 'sampling/importance_sampling_ratio/min': 0.00038077132194302976, 'sampling/importance_sampling_ratio/mean': 0.999940812587738, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7778806164860725, 'clip_ratio/low_mean': 1.8177100628236076e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.141844553691044e-06, 'clip_ratio/high_max': 2.0567378214764176e-05, 'clip_ratio/region_mean': 2.3318944840866607e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 330/1024 [15:02:30<32:04:21, 166.37s/it][AINFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 331/1024 [15:04:52<30:36:47, 159.03s/it][A
+                                                         [A{'loss': 0.0746, 'grad_norm': 0.0023572889622300863, 'learning_rate': 1e-05, 'num_tokens': 288506735.0, 'completions/mean_length': 6288.1875, 'completions/min_length': 751.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6127.93701171875, 'completions/min_terminated_length': 751.0, 'completions/max_terminated_length': 13820.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3066929280757904, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017407266423106194, 'sampling/sampling_logp_difference/max': 7.749598503112793, 'sampling/importance_sampling_ratio/min': 0.000430915504693985, 'sampling/importance_sampling_ratio/mean': 0.9999474287033081, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7709921672940254, 'clip_ratio/low_mean': 3.1423560130861006e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.772717253828887e-06, 'clip_ratio/high_max': 3.109086901531555e-05, 'clip_ratio/region_mean': 3.919627738468989e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 331/1024 [15:04:52<30:36:47, 159.03s/it][AINFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 32%|███▏      | 332/1024 [15:07:39<31:01:48, 161.43s/it][A
+                                                         [A{'loss': 0.0986, 'grad_norm': 0.0034220058005303144, 'learning_rate': 1e-05, 'num_tokens': 289395498.0, 'completions/mean_length': 6775.0234375, 'completions/min_length': 655.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6465.05615234375, 'completions/min_terminated_length': 655.0, 'completions/max_terminated_length': 16318.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.34533774852752686, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019930530339479446, 'sampling/sampling_logp_difference/max': 3.449552536010742, 'sampling/importance_sampling_ratio/min': 0.0317598432302475, 'sampling/importance_sampling_ratio/mean': 0.9999603033065796, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9338318258523941, 'clip_ratio/low_mean': 6.26047980176736e-05, 'clip_ratio/low_min': 5.51267930859467e-06, 'clip_ratio/high_mean': 9.51674803673086e-06, 'clip_ratio/high_max': 3.4638953366084024e-05, 'clip_ratio/region_mean': 7.212154741864651e-05, 'epoch': 0.31}
+
+ 32%|███▏      | 332/1024 [15:07:39<31:01:48, 161.43s/it][AINFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 333/1024 [15:10:18<30:48:09, 160.48s/it][A
+                                                         [A{'loss': 0.0262, 'grad_norm': 0.002513247774913907, 'learning_rate': 1e-05, 'num_tokens': 290329082.0, 'completions/mean_length': 7142.9375, 'completions/min_length': 707.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6844.83837890625, 'completions/min_terminated_length': 707.0, 'completions/max_terminated_length': 15295.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022366533055901527, 'sampling/sampling_logp_difference/max': 14.969992637634277, 'sampling/importance_sampling_ratio/min': 3.152207455059397e-07, 'sampling/importance_sampling_ratio/mean': 0.9999737739562988, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.971405878663063, 'clip_ratio/low_mean': 7.159989991123439e-05, 'clip_ratio/low_min': 1.5592839645250933e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 7.159989991123439e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 333/1024 [15:10:18<30:48:09, 160.48s/it][AINFO 12-02 04:35:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:35:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:35:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:35:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 334/1024 [15:13:11<31:29:07, 164.27s/it][A
+                                                         [A{'loss': 0.0204, 'grad_norm': 0.0056767817586660385, 'learning_rate': 1e-05, 'num_tokens': 291170133.0, 'completions/mean_length': 6412.2109375, 'completions/min_length': 544.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6333.69287109375, 'completions/min_terminated_length': 544.0, 'completions/max_terminated_length': 15581.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.15650184452533722, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020012658089399338, 'sampling/sampling_logp_difference/max': 7.687117099761963, 'sampling/importance_sampling_ratio/min': 0.000458698661532253, 'sampling/importance_sampling_ratio/mean': 0.9999720454216003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9136044681072235, 'clip_ratio/low_mean': 1.7493430505055585e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.126938051740581e-06, 'clip_ratio/high_max': 1.6507752206962323e-05, 'clip_ratio/region_mean': 2.1620368215735652e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 334/1024 [15:13:11<31:29:07, 164.27s/it][AINFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 335/1024 [15:16:10<32:20:01, 168.94s/it][A
+                                                         [A{'loss': 0.0432, 'grad_norm': 0.00243841833434999, 'learning_rate': 1e-05, 'num_tokens': 292222082.0, 'completions/mean_length': 8066.1015625, 'completions/min_length': 497.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7797.7822265625, 'completions/min_terminated_length': 497.0, 'completions/max_terminated_length': 16111.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2688046097755432, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.023650091141462326, 'sampling/sampling_logp_difference/max': 9.374991416931152, 'sampling/importance_sampling_ratio/min': 8.481895929435268e-05, 'sampling/importance_sampling_ratio/mean': 0.9999664425849915, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0789504647254944, 'clip_ratio/low_mean': 3.6938338666914206e-05, 'clip_ratio/low_min': 5.699044777429663e-06, 'clip_ratio/high_mean': 2.0652136072385474e-06, 'clip_ratio/high_max': 8.26085442895419e-06, 'clip_ratio/region_mean': 3.900355193309224e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 335/1024 [15:16:10<32:20:01, 168.94s/it][AINFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 336/1024 [15:18:50<31:43:31, 166.00s/it][A
+                                                         [A{'loss': 0.0372, 'grad_norm': 0.0020856577903032303, 'learning_rate': 1e-05, 'num_tokens': 293115984.0, 'completions/mean_length': 6836.046875, 'completions/min_length': 785.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6606.896484375, 'completions/min_terminated_length': 785.0, 'completions/max_terminated_length': 15176.0, 'rewards/accuracy_reward/mean': 0.21875, 'rewards/accuracy_reward/std': 0.41502299904823303, 'reward': 0.21875, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022012067958712578, 'sampling/sampling_logp_difference/max': 10.488847732543945, 'sampling/importance_sampling_ratio/min': 2.784526441246271e-05, 'sampling/importance_sampling_ratio/mean': 0.9999911785125732, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.218759760260582, 'clip_ratio/low_mean': 1.9117383317279746e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.330013674305519e-06, 'clip_ratio/high_max': 5.320054697222076e-06, 'clip_ratio/region_mean': 2.0447396991585265e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 336/1024 [15:18:50<31:43:31, 166.00s/it][AINFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 337/1024 [15:21:57<32:52:51, 172.30s/it][A
+                                                         [A{'loss': 0.0354, 'grad_norm': 0.005163854919373989, 'learning_rate': 1e-05, 'num_tokens': 294099503.0, 'completions/mean_length': 7501.9921875, 'completions/min_length': 1237.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7140.9345703125, 'completions/min_terminated_length': 1237.0, 'completions/max_terminated_length': 15796.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.30904707312583923, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020813245326280594, 'sampling/sampling_logp_difference/max': 7.331547260284424, 'sampling/importance_sampling_ratio/min': 0.0006545600481331348, 'sampling/importance_sampling_ratio/mean': 0.9999276399612427, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8940394818782806, 'clip_ratio/low_mean': 4.6741323160404136e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.549717793153832e-06, 'clip_ratio/high_max': 2.5695502699818462e-05, 'clip_ratio/region_mean': 5.429104089671455e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 337/1024 [15:21:57<32:52:51, 172.30s/it][AINFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 338/1024 [15:24:54<33:07:29, 173.83s/it][A
+                                                         [A{'loss': 0.0963, 'grad_norm': 0.0029277894645929337, 'learning_rate': 1e-05, 'num_tokens': 295042105.0, 'completions/mean_length': 7204.828125, 'completions/min_length': 846.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6908.7255859375, 'completions/min_terminated_length': 846.0, 'completions/max_terminated_length': 16034.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.28801077604293823, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020187582820653915, 'sampling/sampling_logp_difference/max': 10.872637748718262, 'sampling/importance_sampling_ratio/min': 1.8970265955431387e-05, 'sampling/importance_sampling_ratio/mean': 1.0000677108764648, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9961872175335884, 'clip_ratio/low_mean': 4.5567895540443715e-05, 'clip_ratio/low_min': 4.458871444512624e-06, 'clip_ratio/high_mean': 9.45794374729303e-06, 'clip_ratio/high_max': 3.1606674838258186e-05, 'clip_ratio/region_mean': 5.502583962879726e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 338/1024 [15:24:54<33:07:29, 173.83s/it][AINFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 339/1024 [15:27:38<32:30:54, 170.88s/it][A
+                                                         [A{'loss': 0.0473, 'grad_norm': 0.0032952844630926847, 'learning_rate': 1e-05, 'num_tokens': 295867039.0, 'completions/mean_length': 6256.859375, 'completions/min_length': 1006.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6013.80810546875, 'completions/min_terminated_length': 1006.0, 'completions/max_terminated_length': 15856.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.24670752882957458, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019491540268063545, 'sampling/sampling_logp_difference/max': 9.434039115905762, 'sampling/importance_sampling_ratio/min': 7.995560008566827e-05, 'sampling/importance_sampling_ratio/mean': 0.9999649524688721, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9293600022792816, 'clip_ratio/low_mean': 1.8380221035840805e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.194059781388205e-06, 'clip_ratio/high_max': 1.7963964182854397e-05, 'clip_ratio/region_mean': 2.357428081722901e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 339/1024 [15:27:38<32:30:54, 170.88s/it][AINFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 340/1024 [15:30:30<32:33:21, 171.35s/it][A
+                                                         [A{'loss': 0.0621, 'grad_norm': 0.0029417150653898716, 'learning_rate': 1e-05, 'num_tokens': 296832843.0, 'completions/mean_length': 7397.84375, 'completions/min_length': 923.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7032.552734375, 'completions/min_terminated_length': 923.0, 'completions/max_terminated_length': 15412.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2867125868797302, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01975393109023571, 'sampling/sampling_logp_difference/max': 10.93724250793457, 'sampling/importance_sampling_ratio/min': 1.7783446310204454e-05, 'sampling/importance_sampling_ratio/mean': 1.0000183582305908, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8508890569210052, 'clip_ratio/low_mean': 2.7479814093567256e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8943877648780472e-06, 'clip_ratio/high_max': 7.577551059512189e-06, 'clip_ratio/region_mean': 2.9374201631071628e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 340/1024 [15:30:30<32:33:21, 171.35s/it][AINFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 341/1024 [15:33:07<31:39:28, 166.86s/it][A
+                                                         [A{'loss': 0.0664, 'grad_norm': 0.0026788609102368355, 'learning_rate': 1e-05, 'num_tokens': 297735285.0, 'completions/mean_length': 6897.765625, 'completions/min_length': 371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6823.07080078125, 'completions/min_terminated_length': 371.0, 'completions/max_terminated_length': 14983.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3266732692718506, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020578179508447647, 'sampling/sampling_logp_difference/max': 6.370794296264648, 'sampling/importance_sampling_ratio/min': 0.001710799871943891, 'sampling/importance_sampling_ratio/mean': 0.999909520149231, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9046694040298462, 'clip_ratio/low_mean': 5.109179869577929e-05, 'clip_ratio/low_min': 6.657612175331451e-06, 'clip_ratio/high_mean': 1.3302957199812226e-05, 'clip_ratio/high_max': 3.281225508544594e-05, 'clip_ratio/region_mean': 6.439475532715733e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 341/1024 [15:33:07<31:39:28, 166.86s/it][AINFO 12-02 04:58:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 342/1024 [15:36:11<32:34:06, 171.92s/it][A
+                                                         [A{'loss': 0.0923, 'grad_norm': 0.005915141198784113, 'learning_rate': 1e-05, 'num_tokens': 298645124.0, 'completions/mean_length': 6971.9921875, 'completions/min_length': 6.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6509.10595703125, 'completions/min_terminated_length': 6.0, 'completions/max_terminated_length': 15525.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3742823898792267, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01906151883304119, 'sampling/sampling_logp_difference/max': 6.937347412109375, 'sampling/importance_sampling_ratio/min': 0.000970841443631798, 'sampling/importance_sampling_ratio/mean': 0.9999268651008606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8658201694488525, 'clip_ratio/low_mean': 7.019768918326008e-05, 'clip_ratio/low_min': 2.541147478041239e-05, 'clip_ratio/high_mean': 5.168538336874917e-06, 'clip_ratio/high_max': 1.7319889593636617e-05, 'clip_ratio/region_mean': 7.53662266106403e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 342/1024 [15:36:11<32:34:06, 171.92s/it][AINFO 12-02 05:01:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 33%|███▎      | 343/1024 [15:38:55<32:06:03, 169.70s/it][A
+                                                         [A{'loss': 0.097, 'grad_norm': 0.0032792428974062204, 'learning_rate': 1e-05, 'num_tokens': 299503781.0, 'completions/mean_length': 6545.6953125, 'completions/min_length': 800.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5889.80859375, 'completions/min_terminated_length': 800.0, 'completions/max_terminated_length': 15054.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.38293448090553284, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017413027584552765, 'sampling/sampling_logp_difference/max': 6.124998092651367, 'sampling/importance_sampling_ratio/min': 0.002187495119869709, 'sampling/importance_sampling_ratio/mean': 0.9999361634254456, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.779609851539135, 'clip_ratio/low_mean': 6.167940273371642e-05, 'clip_ratio/low_min': 5.969151516183047e-06, 'clip_ratio/high_mean': 4.583216309583804e-06, 'clip_ratio/high_max': 1.8332865238335216e-05, 'clip_ratio/region_mean': 6.626261847486603e-05, 'epoch': 0.32}
+
+ 33%|███▎      | 343/1024 [15:38:55<32:06:03, 169.70s/it][AINFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▎      | 344/1024 [15:41:50<32:20:24, 171.21s/it][A
+                                                         [A{'loss': 0.1102, 'grad_norm': 0.005092279519885778, 'learning_rate': 1e-05, 'num_tokens': 300447903.0, 'completions/mean_length': 7226.515625, 'completions/min_length': 454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7006.736328125, 'completions/min_terminated_length': 454.0, 'completions/max_terminated_length': 15318.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2998581528663635, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021570835262537003, 'sampling/sampling_logp_difference/max': 7.374476909637451, 'sampling/importance_sampling_ratio/min': 0.000627054600045085, 'sampling/importance_sampling_ratio/mean': 0.9999373555183411, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9573849961161613, 'clip_ratio/low_mean': 4.46246323235755e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.46246323235755e-05, 'epoch': 0.32}
+
+ 34%|███▎      | 344/1024 [15:41:50<32:20:24, 171.21s/it][AINFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▎      | 345/1024 [15:44:30<31:41:16, 168.01s/it][A
+                                                         [A{'loss': 0.0655, 'grad_norm': 0.005033228080719709, 'learning_rate': 1e-05, 'num_tokens': 301206021.0, 'completions/mean_length': 5755.171875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5323.10546875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 14967.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3424547016620636, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018870476633310318, 'sampling/sampling_logp_difference/max': 6.531146049499512, 'sampling/importance_sampling_ratio/min': 0.0014573346124961972, 'sampling/importance_sampling_ratio/mean': 0.9999947547912598, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8482184633612633, 'clip_ratio/low_mean': 4.7280102080549113e-05, 'clip_ratio/low_min': 1.0166083029616857e-05, 'clip_ratio/high_mean': 1.3718173477172968e-06, 'clip_ratio/high_max': 5.487269390869187e-06, 'clip_ratio/region_mean': 4.865191931457957e-05, 'epoch': 0.32}
+
+ 34%|███▎      | 345/1024 [15:44:30<31:41:16, 168.01s/it][AINFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 346/1024 [15:47:19<31:41:58, 168.32s/it][A
+                                                         [A{'loss': 0.0707, 'grad_norm': 0.007659573573619127, 'learning_rate': 1e-05, 'num_tokens': 302133890.0, 'completions/mean_length': 7098.7265625, 'completions/min_length': 947.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6875.88037109375, 'completions/min_terminated_length': 947.0, 'completions/max_terminated_length': 15509.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.23410367965698242, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019994346424937248, 'sampling/sampling_logp_difference/max': 6.687288761138916, 'sampling/importance_sampling_ratio/min': 0.0012466582702472806, 'sampling/importance_sampling_ratio/mean': 1.0000004768371582, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.87320177257061, 'clip_ratio/low_mean': 1.6510994441887306e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3553367352869827e-06, 'clip_ratio/high_max': 5.421346941147931e-06, 'clip_ratio/region_mean': 1.786633117717429e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 346/1024 [15:47:19<31:41:58, 168.32s/it][AINFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 347/1024 [15:50:05<31:29:14, 167.44s/it][A
+                                                         [A{'loss': 0.0487, 'grad_norm': 0.0014135175151750445, 'learning_rate': 1e-05, 'num_tokens': 302972566.0, 'completions/mean_length': 6399.96875, 'completions/min_length': 364.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6077.90283203125, 'completions/min_terminated_length': 364.0, 'completions/max_terminated_length': 16139.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02073008380830288, 'sampling/sampling_logp_difference/max': 5.963917255401611, 'sampling/importance_sampling_ratio/min': 0.0025698256213217974, 'sampling/importance_sampling_ratio/mean': 0.9999452829360962, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9481896534562111, 'clip_ratio/low_mean': 3.8744643916288624e-05, 'clip_ratio/low_min': 6.108287834649673e-06, 'clip_ratio/high_mean': 2.8890573275930365e-06, 'clip_ratio/high_max': 1.1556229310372146e-05, 'clip_ratio/region_mean': 4.1633702039689524e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 347/1024 [15:50:05<31:29:14, 167.44s/it][AINFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 348/1024 [15:52:57<31:41:34, 168.78s/it][A
+                                                         [A{'loss': 0.0126, 'grad_norm': 0.0027898226398974657, 'learning_rate': 1e-05, 'num_tokens': 303925976.0, 'completions/mean_length': 7298.078125, 'completions/min_length': 1009.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7226.53564453125, 'completions/min_terminated_length': 1009.0, 'completions/max_terminated_length': 16095.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020944103598594666, 'sampling/sampling_logp_difference/max': 5.252114772796631, 'sampling/importance_sampling_ratio/min': 0.005236432887613773, 'sampling/importance_sampling_ratio/mean': 0.9999772310256958, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8719206526875496, 'clip_ratio/low_mean': 4.620846755187813e-05, 'clip_ratio/low_min': 6.243132702365983e-06, 'clip_ratio/high_mean': 2.545892130001448e-06, 'clip_ratio/high_max': 6.59491388432798e-06, 'clip_ratio/region_mean': 4.875435956819274e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 348/1024 [15:52:57<31:41:34, 168.78s/it][AINFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 349/1024 [15:55:24<30:25:36, 162.28s/it][A
+                                                         [A{'loss': 0.0248, 'grad_norm': 0.0012764945859089494, 'learning_rate': 1e-05, 'num_tokens': 304675157.0, 'completions/mean_length': 5667.0390625, 'completions/min_length': 974.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5496.9287109375, 'completions/min_terminated_length': 974.0, 'completions/max_terminated_length': 14980.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.17965976893901825, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018928447738289833, 'sampling/sampling_logp_difference/max': 12.195245742797852, 'sampling/importance_sampling_ratio/min': 5.054428584116977e-06, 'sampling/importance_sampling_ratio/mean': 1.0000383853912354, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8791451379656792, 'clip_ratio/low_mean': 2.010384196182713e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6303108597858227e-06, 'clip_ratio/high_max': 1.052124343914329e-05, 'clip_ratio/region_mean': 2.273415248055244e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 349/1024 [15:55:24<30:25:36, 162.28s/it][AINFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 350/1024 [15:57:41<28:58:05, 154.73s/it][A
+                                                         [A{'loss': 0.0221, 'grad_norm': 0.00509974779561162, 'learning_rate': 1e-05, 'num_tokens': 305447038.0, 'completions/mean_length': 5874.4453125, 'completions/min_length': 486.0, 'completions/max_length': 15354.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5874.4453125, 'completions/min_terminated_length': 486.0, 'completions/max_terminated_length': 15354.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.24777325987815857, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02114470861852169, 'sampling/sampling_logp_difference/max': 5.340880870819092, 'sampling/importance_sampling_ratio/min': 0.004791648127138615, 'sampling/importance_sampling_ratio/mean': 0.9999423027038574, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9577538818120956, 'clip_ratio/low_mean': 3.1114799753595435e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3945113980516908e-06, 'clip_ratio/high_max': 9.578045592206763e-06, 'clip_ratio/region_mean': 3.350931149270764e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 350/1024 [15:57:41<28:58:05, 154.73s/it][AINFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 351/1024 [16:00:17<29:00:01, 155.13s/it][A
+                                                         [A{'loss': 0.06, 'grad_norm': 0.0030849494505673647, 'learning_rate': 1e-05, 'num_tokens': 306258023.0, 'completions/mean_length': 6197.5703125, 'completions/min_length': 316.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6035.88134765625, 'completions/min_terminated_length': 316.0, 'completions/max_terminated_length': 15670.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.3748064339160919, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021017421036958694, 'sampling/sampling_logp_difference/max': 7.093727111816406, 'sampling/importance_sampling_ratio/min': 0.000830297009088099, 'sampling/importance_sampling_ratio/mean': 0.9998056888580322, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8665244281291962, 'clip_ratio/low_mean': 4.784364205079328e-05, 'clip_ratio/low_min': 3.861600362142781e-06, 'clip_ratio/high_mean': 2.7257655688117666e-06, 'clip_ratio/high_max': 1.0903062275247066e-05, 'clip_ratio/region_mean': 5.056940744907479e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 351/1024 [16:00:17<29:00:01, 155.13s/it][AINFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 352/1024 [16:03:08<29:51:20, 159.94s/it][A
+                                                         [A{'loss': 0.076, 'grad_norm': 0.002946985885500908, 'learning_rate': 1e-05, 'num_tokens': 307240305.0, 'completions/mean_length': 7522.578125, 'completions/min_length': 794.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7381.9208984375, 'completions/min_terminated_length': 794.0, 'completions/max_terminated_length': 16276.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01965932548046112, 'sampling/sampling_logp_difference/max': 5.273195743560791, 'sampling/importance_sampling_ratio/min': 0.005127199459820986, 'sampling/importance_sampling_ratio/mean': 0.9999547004699707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8185881152749062, 'clip_ratio/low_mean': 6.213493452378316e-05, 'clip_ratio/low_min': 1.0056635801447555e-05, 'clip_ratio/high_mean': 4.3520980170796975e-06, 'clip_ratio/high_max': 1.4299712574938894e-05, 'clip_ratio/region_mean': 6.648703174505499e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 352/1024 [16:03:08<29:51:20, 159.94s/it][AINFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 353/1024 [16:05:41<29:24:17, 157.76s/it][A
+                                                         [A{'loss': 0.072, 'grad_norm': 0.0031181599479168653, 'learning_rate': 1e-05, 'num_tokens': 308079318.0, 'completions/mean_length': 6403.2265625, 'completions/min_length': 552.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6163.6884765625, 'completions/min_terminated_length': 552.0, 'completions/max_terminated_length': 14090.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.27145031094551086, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01963040418922901, 'sampling/sampling_logp_difference/max': 9.605287551879883, 'sampling/importance_sampling_ratio/min': 6.73715621815063e-05, 'sampling/importance_sampling_ratio/mean': 0.9999215602874756, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8359840363264084, 'clip_ratio/low_mean': 4.2052345861520735e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.08456730585749e-06, 'clip_ratio/high_max': 1.693051035545068e-05, 'clip_ratio/region_mean': 4.713691282631771e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 353/1024 [16:05:41<29:24:17, 157.76s/it][AINFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 354/1024 [16:08:40<30:32:35, 164.11s/it][A
+                                                         [A{'loss': 0.1022, 'grad_norm': 0.002656223252415657, 'learning_rate': 1e-05, 'num_tokens': 309117770.0, 'completions/mean_length': 7954.03125, 'completions/min_length': 632.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7751.71240234375, 'completions/min_terminated_length': 632.0, 'completions/max_terminated_length': 16148.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020741507411003113, 'sampling/sampling_logp_difference/max': 7.999940395355225, 'sampling/importance_sampling_ratio/min': 0.0003354826185386628, 'sampling/importance_sampling_ratio/mean': 0.9999536275863647, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.905990719795227, 'clip_ratio/low_mean': 6.722658486069122e-05, 'clip_ratio/low_min': 1.858519090092159e-05, 'clip_ratio/high_mean': 3.497229783988587e-06, 'clip_ratio/high_max': 1.3988919135954347e-05, 'clip_ratio/region_mean': 7.072381458783639e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 354/1024 [16:08:40<30:32:35, 164.11s/it][AINFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 355/1024 [16:11:34<31:03:15, 167.11s/it][A
+                                                         [A{'loss': 0.0617, 'grad_norm': 0.0060529084876179695, 'learning_rate': 1e-05, 'num_tokens': 309988894.0, 'completions/mean_length': 6630.09375, 'completions/min_length': 375.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6315.45166015625, 'completions/min_terminated_length': 375.0, 'completions/max_terminated_length': 16272.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2790592312812805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02060208097100258, 'sampling/sampling_logp_difference/max': 10.716434478759766, 'sampling/importance_sampling_ratio/min': 2.2177453502081335e-05, 'sampling/importance_sampling_ratio/mean': 0.9998822212219238, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.870736837387085, 'clip_ratio/low_mean': 4.337988764291367e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.402648755785776e-06, 'clip_ratio/high_max': 1.7610595023143105e-05, 'clip_ratio/region_mean': 4.778253651238629e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 355/1024 [16:11:34<31:03:15, 167.11s/it][AINFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 05:38:09,170 - math_verify.grader - WARNING - Timeout during comparison
+
+ 35%|███▍      | 356/1024 [16:14:04<30:05:25, 162.16s/it][A
+                                                         [A{'loss': 0.0605, 'grad_norm': 0.00400698184967041, 'learning_rate': 1e-05, 'num_tokens': 310864013.0, 'completions/mean_length': 6679.6171875, 'completions/min_length': 611.0, 'completions/max_length': 15920.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6679.6171875, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 15920.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3295465111732483, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02098071575164795, 'sampling/sampling_logp_difference/max': 6.1853485107421875, 'sampling/importance_sampling_ratio/min': 0.0020593837834894657, 'sampling/importance_sampling_ratio/mean': 0.9999049305915833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9812518879771233, 'clip_ratio/low_mean': 3.1030769946482906e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6120233087567613e-06, 'clip_ratio/high_max': 1.0448093235027045e-05, 'clip_ratio/region_mean': 3.364279325523967e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 356/1024 [16:14:04<30:05:25, 162.16s/it][AINFO 12-02 05:39:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 357/1024 [16:16:39<29:36:07, 159.77s/it][A
+                                                         [A{'loss': 0.0362, 'grad_norm': 0.005929585546255112, 'learning_rate': 1e-05, 'num_tokens': 311589987.0, 'completions/mean_length': 5523.796875, 'completions/min_length': 633.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5173.4677734375, 'completions/min_terminated_length': 633.0, 'completions/max_terminated_length': 14541.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019948206841945648, 'sampling/sampling_logp_difference/max': 6.843744277954102, 'sampling/importance_sampling_ratio/min': 0.0010661041596904397, 'sampling/importance_sampling_ratio/mean': 0.9998446702957153, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9120645374059677, 'clip_ratio/low_mean': 2.900951585615985e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.736592579021817e-06, 'clip_ratio/high_max': 2.124982574969181e-05, 'clip_ratio/region_mean': 3.674610888992902e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 357/1024 [16:16:39<29:36:07, 159.77s/it][AINFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 358/1024 [16:19:07<28:54:07, 156.23s/it][A
+                                                         [A{'loss': 0.1023, 'grad_norm': 0.006622390355914831, 'learning_rate': 1e-05, 'num_tokens': 312424034.0, 'completions/mean_length': 6361.3671875, 'completions/min_length': 432.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6282.44873046875, 'completions/min_terminated_length': 432.0, 'completions/max_terminated_length': 15401.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3724474310874939, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018907658755779266, 'sampling/sampling_logp_difference/max': 8.060688972473145, 'sampling/importance_sampling_ratio/min': 0.0003157092141918838, 'sampling/importance_sampling_ratio/mean': 1.0000219345092773, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8044678047299385, 'clip_ratio/low_mean': 5.346400575945154e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.219769085826556e-06, 'clip_ratio/high_max': 2.4486997745043482e-05, 'clip_ratio/region_mean': 6.168377467474784e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 358/1024 [16:19:07<28:54:07, 156.23s/it][AINFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 359/1024 [16:21:47<29:03:51, 157.34s/it][A
+                                                         [A{'loss': 0.0861, 'grad_norm': 0.004639944992959499, 'learning_rate': 1e-05, 'num_tokens': 313353346.0, 'completions/mean_length': 7109.0, 'completions/min_length': 611.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7035.96826171875, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 15883.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3826971650123596, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02155841514468193, 'sampling/sampling_logp_difference/max': 6.262202262878418, 'sampling/importance_sampling_ratio/min': 0.0019070414127781987, 'sampling/importance_sampling_ratio/mean': 0.9999389052391052, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9167275875806808, 'clip_ratio/low_mean': 5.925514369664597e-05, 'clip_ratio/low_min': 1.3324347946763737e-05, 'clip_ratio/high_mean': 2.6018441872110998e-06, 'clip_ratio/high_max': 1.0407376748844399e-05, 'clip_ratio/region_mean': 6.185698703120579e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 359/1024 [16:21:47<29:03:51, 157.34s/it][AINFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 360/1024 [16:24:35<29:37:36, 160.63s/it][A
+                                                         [A{'loss': 0.0873, 'grad_norm': 0.007643720600754023, 'learning_rate': 1e-05, 'num_tokens': 314180717.0, 'completions/mean_length': 6314.2734375, 'completions/min_length': 665.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6072.60009765625, 'completions/min_terminated_length': 665.0, 'completions/max_terminated_length': 15795.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.28117600083351135, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01964358240365982, 'sampling/sampling_logp_difference/max': 3.8497378826141357, 'sampling/importance_sampling_ratio/min': 0.021285315975546837, 'sampling/importance_sampling_ratio/mean': 0.9999802112579346, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8780038207769394, 'clip_ratio/low_mean': 3.3944450819944905e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0912523691786191e-05, 'clip_ratio/high_max': 3.959046694035351e-05, 'clip_ratio/region_mean': 4.485697365907981e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 360/1024 [16:24:35<29:37:36, 160.63s/it][AINFO 12-02 05:49:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:49:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:49:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:49:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 361/1024 [16:27:25<30:06:26, 163.48s/it][A
+                                                         [A{'loss': 0.0109, 'grad_norm': 0.0050973957404494286, 'learning_rate': 1e-05, 'num_tokens': 315060842.0, 'completions/mean_length': 6718.2265625, 'completions/min_length': 505.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6486.24853515625, 'completions/min_terminated_length': 505.0, 'completions/max_terminated_length': 16167.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3521803915500641, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019501537084579468, 'sampling/sampling_logp_difference/max': 6.998699188232422, 'sampling/importance_sampling_ratio/min': 0.0009130688849836588, 'sampling/importance_sampling_ratio/mean': 1.000014066696167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8326799497008324, 'clip_ratio/low_mean': 4.137891801292426e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.187473835936544e-06, 'clip_ratio/high_max': 3.065382111344661e-05, 'clip_ratio/region_mean': 5.056639065514901e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 361/1024 [16:27:25<30:06:26, 163.48s/it][AINFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 362/1024 [16:30:26<31:02:10, 168.78s/it][A
+                                                         [A{'loss': 0.036, 'grad_norm': 0.0019092690199613571, 'learning_rate': 1e-05, 'num_tokens': 316190325.0, 'completions/mean_length': 8666.8359375, 'completions/min_length': 565.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7941.291015625, 'completions/min_terminated_length': 565.0, 'completions/max_terminated_length': 16128.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.2022808939218521, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02051631174981594, 'sampling/sampling_logp_difference/max': 10.249995231628418, 'sampling/importance_sampling_ratio/min': 3.5357668821234256e-05, 'sampling/importance_sampling_ratio/mean': 0.9999814629554749, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9526705741882324, 'clip_ratio/low_mean': 1.8797969062234188e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.156213440684951e-06, 'clip_ratio/high_max': 8.624853762739804e-06, 'clip_ratio/region_mean': 2.0954182048171788e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 362/1024 [16:30:26<31:02:10, 168.78s/it][AINFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 363/1024 [16:33:23<31:27:20, 171.32s/it][A
+                                                         [A{'loss': 0.0729, 'grad_norm': 0.0019530428107827902, 'learning_rate': 1e-05, 'num_tokens': 317191878.0, 'completions/mean_length': 7661.8203125, 'completions/min_length': 649.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7002.16015625, 'completions/min_terminated_length': 649.0, 'completions/max_terminated_length': 15164.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.21382391452789307, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019894573837518692, 'sampling/sampling_logp_difference/max': 9.367389678955078, 'sampling/importance_sampling_ratio/min': 8.546619210392237e-05, 'sampling/importance_sampling_ratio/mean': 0.9999173879623413, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8322782590985298, 'clip_ratio/low_mean': 3.521234066283796e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.434908300434472e-06, 'clip_ratio/high_max': 2.147400391550036e-05, 'clip_ratio/region_mean': 4.164724816746457e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 363/1024 [16:33:23<31:27:20, 171.32s/it][AINFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 364/1024 [16:36:26<32:01:19, 174.67s/it][A
+                                                         [A{'loss': 0.0391, 'grad_norm': 0.0031784537713974714, 'learning_rate': 1e-05, 'num_tokens': 318109004.0, 'completions/mean_length': 7024.859375, 'completions/min_length': 693.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6800.240234375, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 15934.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.31800347566604614, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018519200384616852, 'sampling/sampling_logp_difference/max': 8.124353408813477, 'sampling/importance_sampling_ratio/min': 0.0002962362195830792, 'sampling/importance_sampling_ratio/mean': 0.9999352693557739, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.794853538274765, 'clip_ratio/low_mean': 4.2698405422925134e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.089704697842535e-06, 'clip_ratio/high_max': 1.9436202364886412e-05, 'clip_ratio/region_mean': 4.878810955233348e-05, 'epoch': 0.33}
+
+ 36%|███▌      | 364/1024 [16:36:26<32:01:19, 174.67s/it][AINFO 12-02 06:01:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:01:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:01:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:01:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 365/1024 [16:39:09<31:21:59, 171.35s/it][A
+                                                         [A{'loss': 0.041, 'grad_norm': 0.005080109462141991, 'learning_rate': 1e-05, 'num_tokens': 319059075.0, 'completions/mean_length': 7282.1796875, 'completions/min_length': 870.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6912.1865234375, 'completions/min_terminated_length': 870.0, 'completions/max_terminated_length': 15624.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019978653639554977, 'sampling/sampling_logp_difference/max': 6.136754989624023, 'sampling/importance_sampling_ratio/min': 0.1194523349404335, 'sampling/importance_sampling_ratio/mean': 1.000062108039856, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.904067650437355, 'clip_ratio/low_mean': 4.342453667049995e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0318639169781818e-06, 'clip_ratio/high_max': 4.127455667912727e-06, 'clip_ratio/region_mean': 4.445640047379129e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 365/1024 [16:39:09<31:21:59, 171.35s/it][AINFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 366/1024 [16:42:21<32:24:49, 177.34s/it][A
+                                                         [A{'loss': 0.0699, 'grad_norm': 0.0022667953744530678, 'learning_rate': 1e-05, 'num_tokens': 319990046.0, 'completions/mean_length': 7131.5234375, 'completions/min_length': 373.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6596.255859375, 'completions/min_terminated_length': 373.0, 'completions/max_terminated_length': 15625.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.30221715569496155, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02037571743130684, 'sampling/sampling_logp_difference/max': 3.294381618499756, 'sampling/importance_sampling_ratio/min': 0.0370909757912159, 'sampling/importance_sampling_ratio/mean': 0.9999264478683472, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8849587142467499, 'clip_ratio/low_mean': 2.608940076243016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.608940076243016e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 366/1024 [16:42:21<32:24:49, 177.34s/it][AINFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 367/1024 [16:45:11<31:56:51, 175.06s/it][A
+                                                         [A{'loss': 0.0681, 'grad_norm': 0.00216497085057199, 'learning_rate': 1e-05, 'num_tokens': 320860135.0, 'completions/mean_length': 6655.4453125, 'completions/min_length': 378.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6578.84228515625, 'completions/min_terminated_length': 378.0, 'completions/max_terminated_length': 16205.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.3369230031967163, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01771342009305954, 'sampling/sampling_logp_difference/max': 7.563511371612549, 'sampling/importance_sampling_ratio/min': 0.0005190494703128934, 'sampling/importance_sampling_ratio/mean': 0.9999319314956665, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7417122721672058, 'clip_ratio/low_mean': 3.4950805911648786e-05, 'clip_ratio/low_min': 4.876336333836662e-06, 'clip_ratio/high_mean': 3.839158978280466e-06, 'clip_ratio/high_max': 1.5356635913121863e-05, 'clip_ratio/region_mean': 3.8789965287833184e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 367/1024 [16:45:11<31:56:51, 175.06s/it][AINFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 368/1024 [16:47:58<31:29:13, 172.79s/it][A
+                                                         [A{'loss': 0.0643, 'grad_norm': 0.0028338562697172165, 'learning_rate': 1e-05, 'num_tokens': 321783852.0, 'completions/mean_length': 7077.1640625, 'completions/min_length': 26.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6619.45068359375, 'completions/min_terminated_length': 26.0, 'completions/max_terminated_length': 15849.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020299233496189117, 'sampling/sampling_logp_difference/max': 11.757177352905273, 'sampling/importance_sampling_ratio/min': 7.83290306571871e-06, 'sampling/importance_sampling_ratio/mean': 0.9998220205307007, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8749325424432755, 'clip_ratio/low_mean': 5.688933060810086e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.297029474604642e-06, 'clip_ratio/high_max': 1.7605634639039636e-05, 'clip_ratio/region_mean': 6.218636053745286e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 368/1024 [16:47:58<31:29:13, 172.79s/it][AINFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 369/1024 [16:50:37<30:40:50, 168.63s/it][A
+                                                         [A{'loss': 0.0275, 'grad_norm': 0.0022897711023688316, 'learning_rate': 1e-05, 'num_tokens': 322572882.0, 'completions/mean_length': 6034.296875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5525.294921875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15329.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2756394147872925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01858348958194256, 'sampling/sampling_logp_difference/max': 7.7979736328125, 'sampling/importance_sampling_ratio/min': 0.0004105660773348063, 'sampling/importance_sampling_ratio/mean': 0.9999347925186157, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.80014718323946, 'clip_ratio/low_mean': 5.158197632226802e-05, 'clip_ratio/low_min': 3.735804057214409e-06, 'clip_ratio/high_mean': 1.8254570477438392e-06, 'clip_ratio/high_max': 7.301828190975357e-06, 'clip_ratio/region_mean': 5.340743223314348e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 369/1024 [16:50:37<30:40:50, 168.63s/it][AINFO 12-02 06:15:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 370/1024 [16:53:25<30:37:29, 168.58s/it][A
+                                                         [A{'loss': 0.0356, 'grad_norm': 0.003263789461925626, 'learning_rate': 1e-05, 'num_tokens': 323640904.0, 'completions/mean_length': 8172.109375, 'completions/min_length': 733.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7838.29248046875, 'completions/min_terminated_length': 733.0, 'completions/max_terminated_length': 15948.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.3237774670124054, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0208889190107584, 'sampling/sampling_logp_difference/max': 11.588455200195312, 'sampling/importance_sampling_ratio/min': 9.27252222027164e-06, 'sampling/importance_sampling_ratio/mean': 0.9999354481697083, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8732693120837212, 'clip_ratio/low_mean': 4.186752630630508e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.351393047523743e-06, 'clip_ratio/high_max': 9.364057859784225e-06, 'clip_ratio/region_mean': 4.5218919240141986e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 370/1024 [16:53:25<30:37:29, 168.58s/it][AINFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▌      | 371/1024 [16:56:28<31:20:32, 172.79s/it][A
+                                                         [A{'loss': 0.0937, 'grad_norm': 0.0042716520838439465, 'learning_rate': 1e-05, 'num_tokens': 324643858.0, 'completions/mean_length': 7699.203125, 'completions/min_length': 1225.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7419.04833984375, 'completions/min_terminated_length': 1225.0, 'completions/max_terminated_length': 16228.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3090519607067108, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018926654011011124, 'sampling/sampling_logp_difference/max': 8.413164138793945, 'sampling/importance_sampling_ratio/min': 0.00022192654432728887, 'sampling/importance_sampling_ratio/mean': 0.9999874234199524, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8296505436301231, 'clip_ratio/low_mean': 4.261424010110204e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.692962131182867e-06, 'clip_ratio/high_max': 2.0998899799451465e-05, 'clip_ratio/region_mean': 4.930720297124935e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 371/1024 [16:56:28<31:20:32, 172.79s/it][AINFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▋      | 372/1024 [16:59:17<31:06:11, 171.74s/it][A
+                                                         [A{'loss': 0.0424, 'grad_norm': 0.0033558050636202097, 'learning_rate': 1e-05, 'num_tokens': 325617687.0, 'completions/mean_length': 7450.1640625, 'completions/min_length': 910.0, 'completions/max_length': 16364.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7450.1640625, 'completions/min_terminated_length': 910.0, 'completions/max_terminated_length': 16364.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02249298244714737, 'sampling/sampling_logp_difference/max': 3.2208595275878906, 'sampling/importance_sampling_ratio/min': 0.039920732378959656, 'sampling/importance_sampling_ratio/mean': 0.9999459385871887, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0400195196270943, 'clip_ratio/low_mean': 4.5005243464402156e-05, 'clip_ratio/low_min': 3.861838649754645e-06, 'clip_ratio/high_mean': 1.765337287906732e-06, 'clip_ratio/high_max': 7.061349151626928e-06, 'clip_ratio/region_mean': 4.6770580411248375e-05, 'epoch': 0.34}
+
+ 36%|███▋      | 372/1024 [16:59:17<31:06:11, 171.74s/it][AINFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 36%|███▋      | 373/1024 [17:01:52<30:07:56, 166.63s/it][A
+                                                         [A{'loss': 0.0476, 'grad_norm': 0.005797459278255701, 'learning_rate': 1e-05, 'num_tokens': 326508384.0, 'completions/mean_length': 6799.0703125, 'completions/min_length': 1708.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6723.5986328125, 'completions/min_terminated_length': 1708.0, 'completions/max_terminated_length': 15342.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021543748676776886, 'sampling/sampling_logp_difference/max': 14.0984525680542, 'sampling/importance_sampling_ratio/min': 7.535634836131067e-07, 'sampling/importance_sampling_ratio/mean': 0.9999321699142456, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9737623482942581, 'clip_ratio/low_mean': 2.4451034505545977e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2869506867427845e-06, 'clip_ratio/high_max': 1.3147802746971138e-05, 'clip_ratio/region_mean': 2.7737984851228248e-05, 'epoch': 0.34}
+
+ 36%|███▋      | 373/1024 [17:01:52<30:07:56, 166.63s/it][AINFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 374/1024 [17:04:39<30:07:25, 166.84s/it][A
+                                                         [A{'loss': 0.0389, 'grad_norm': 0.002258980879560113, 'learning_rate': 1e-05, 'num_tokens': 327426407.0, 'completions/mean_length': 7034.3671875, 'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6654.30078125, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 16102.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01997346058487892, 'sampling/sampling_logp_difference/max': 4.742221832275391, 'sampling/importance_sampling_ratio/min': 0.008719252422451973, 'sampling/importance_sampling_ratio/mean': 0.9999661445617676, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8749603256583214, 'clip_ratio/low_mean': 2.3457610382138228e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.398505997320171e-07, 'clip_ratio/high_max': 3.3594023989280686e-06, 'clip_ratio/region_mean': 2.4297460981870245e-05, 'epoch': 0.34}
+
+ 37%|███▋      | 374/1024 [17:04:39<30:07:25, 166.84s/it][AINFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 375/1024 [17:07:15<29:29:56, 163.63s/it][A
+                                                         [A{'loss': 0.076, 'grad_norm': 0.002420129720121622, 'learning_rate': 1e-05, 'num_tokens': 328292985.0, 'completions/mean_length': 6623.078125, 'completions/min_length': 569.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6388.81640625, 'completions/min_terminated_length': 569.0, 'completions/max_terminated_length': 15240.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3077537417411804, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019645996391773224, 'sampling/sampling_logp_difference/max': 8.811544418334961, 'sampling/importance_sampling_ratio/min': 0.00014900295354891568, 'sampling/importance_sampling_ratio/mean': 0.9998596906661987, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.858784057199955, 'clip_ratio/low_mean': 4.9395109726901865e-05, 'clip_ratio/low_min': 1.636556044104509e-05, 'clip_ratio/high_mean': 7.058438370677322e-06, 'clip_ratio/high_max': 2.823375348270929e-05, 'clip_ratio/region_mean': 5.6453548268109444e-05, 'epoch': 0.34}
+
+ 37%|███▋      | 375/1024 [17:07:15<29:29:56, 163.63s/it][AINFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 376/1024 [17:09:54<29:11:51, 162.21s/it][A
+                                                         [A{'loss': 0.0824, 'grad_norm': 0.004107976797968149, 'learning_rate': 1e-05, 'num_tokens': 329067006.0, 'completions/mean_length': 5902.4765625, 'completions/min_length': 574.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5564.36279296875, 'completions/min_terminated_length': 574.0, 'completions/max_terminated_length': 15229.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3945493996143341, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019582755863666534, 'sampling/sampling_logp_difference/max': 11.37439250946045, 'sampling/importance_sampling_ratio/min': 1.1485875802463852e-05, 'sampling/importance_sampling_ratio/mean': 0.9999526143074036, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.904740035533905, 'clip_ratio/low_mean': 4.051302585139638e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.468551191574079e-06, 'clip_ratio/high_max': 1.8078507309837732e-05, 'clip_ratio/region_mean': 4.698157727034413e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 376/1024 [17:09:54<29:11:51, 162.21s/it][AINFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 377/1024 [17:12:33<28:58:11, 161.19s/it][A
+                                                         [A{'loss': 0.0164, 'grad_norm': 0.003208522219210863, 'learning_rate': 1e-05, 'num_tokens': 329910691.0, 'completions/mean_length': 6425.6015625, 'completions/min_length': 557.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6267.5322265625, 'completions/min_terminated_length': 557.0, 'completions/max_terminated_length': 14514.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021154657006263733, 'sampling/sampling_logp_difference/max': 6.588794231414795, 'sampling/importance_sampling_ratio/min': 0.00137569778598845, 'sampling/importance_sampling_ratio/mean': 0.9999419450759888, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.964553713798523, 'clip_ratio/low_mean': 1.7552573126522475e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.276365181496658e-06, 'clip_ratio/high_max': 2.553658168835682e-05, 'clip_ratio/region_mean': 2.482893796695862e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 377/1024 [17:12:33<28:58:11, 161.19s/it][AINFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 378/1024 [17:15:32<29:51:26, 166.39s/it][A
+                                                         [A{'loss': 0.0815, 'grad_norm': 0.002898421371355653, 'learning_rate': 1e-05, 'num_tokens': 330956332.0, 'completions/mean_length': 8006.4453125, 'completions/min_length': 1235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7594.43408203125, 'completions/min_terminated_length': 1235.0, 'completions/max_terminated_length': 15797.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.20175684988498688, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021021340042352676, 'sampling/sampling_logp_difference/max': 9.27452278137207, 'sampling/importance_sampling_ratio/min': 9.378339746035635e-05, 'sampling/importance_sampling_ratio/mean': 0.9998818635940552, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8980336412787437, 'clip_ratio/low_mean': 4.0991827404468495e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7178105053972104e-06, 'clip_ratio/high_max': 6.8712420215888415e-06, 'clip_ratio/region_mean': 4.2709637853022286e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 378/1024 [17:15:32<29:51:26, 166.39s/it][AINFO 12-02 06:40:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:40:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:40:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:40:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 379/1024 [17:18:11<29:25:21, 164.22s/it][A
+                                                         [A{'loss': 0.0313, 'grad_norm': 0.0037063576746731997, 'learning_rate': 1e-05, 'num_tokens': 331880918.0, 'completions/mean_length': 7068.828125, 'completions/min_length': 791.0, 'completions/max_length': 15484.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7068.828125, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 15484.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.17859892547130585, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02072504535317421, 'sampling/sampling_logp_difference/max': 8.611893653869629, 'sampling/importance_sampling_ratio/min': 0.0001819290773710236, 'sampling/importance_sampling_ratio/mean': 0.9999452829360962, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9865007549524307, 'clip_ratio/low_mean': 2.2689344689297286e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.2689344689297286e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 379/1024 [17:18:11<29:25:21, 164.22s/it][AINFO 12-02 06:43:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:43:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:43:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:43:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 380/1024 [17:21:07<30:00:25, 167.74s/it][A
+                                                         [A{'loss': 0.0228, 'grad_norm': 0.001972826896235347, 'learning_rate': 1e-05, 'num_tokens': 332849112.0, 'completions/mean_length': 7379.390625, 'completions/min_length': 738.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7236.4609375, 'completions/min_terminated_length': 738.0, 'completions/max_terminated_length': 16281.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.28247418999671936, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019411223009228706, 'sampling/sampling_logp_difference/max': 10.476028442382812, 'sampling/importance_sampling_ratio/min': 2.820451663865242e-05, 'sampling/importance_sampling_ratio/mean': 0.999925971031189, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8977236375212669, 'clip_ratio/low_mean': 3.207486906831036e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4614083170272352e-06, 'clip_ratio/high_max': 5.845633268108941e-06, 'clip_ratio/region_mean': 3.353627721480734e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 380/1024 [17:21:07<30:00:25, 167.74s/it][AINFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 381/1024 [17:23:46<29:30:44, 165.23s/it][A
+                                                         [A{'loss': 0.0495, 'grad_norm': 0.006926023401319981, 'learning_rate': 1e-05, 'num_tokens': 333746179.0, 'completions/mean_length': 6867.9609375, 'completions/min_length': 760.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6793.03125, 'completions/min_terminated_length': 760.0, 'completions/max_terminated_length': 15517.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.1433562934398651, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.020311862230300903, 'sampling/sampling_logp_difference/max': 7.8556413650512695, 'sampling/importance_sampling_ratio/min': 0.0003875594411510974, 'sampling/importance_sampling_ratio/mean': 0.9999299645423889, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9244343340396881, 'clip_ratio/low_mean': 2.3530714997832547e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2188462505946518e-06, 'clip_ratio/high_max': 4.875385002378607e-06, 'clip_ratio/region_mean': 2.47495612484272e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 381/1024 [17:23:46<29:30:44, 165.23s/it][AINFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 382/1024 [17:26:35<29:38:12, 166.19s/it][A
+                                                         [A{'loss': 0.0808, 'grad_norm': 0.0047226278111338615, 'learning_rate': 1e-05, 'num_tokens': 334731027.0, 'completions/mean_length': 7525.375, 'completions/min_length': 654.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6855.3955078125, 'completions/min_terminated_length': 654.0, 'completions/max_terminated_length': 15900.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3353874683380127, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021496692672371864, 'sampling/sampling_logp_difference/max': 8.119979858398438, 'sampling/importance_sampling_ratio/min': 0.00029753465787507594, 'sampling/importance_sampling_ratio/mean': 0.9999615550041199, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9207312315702438, 'clip_ratio/low_mean': 5.268017821435933e-05, 'clip_ratio/low_min': 3.950945028918795e-06, 'clip_ratio/high_mean': 4.836261211949022e-06, 'clip_ratio/high_max': 1.5651628245905158e-05, 'clip_ratio/region_mean': 5.751643902840442e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 382/1024 [17:26:35<29:38:12, 166.19s/it][AINFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 383/1024 [17:29:48<31:01:15, 174.22s/it][A
+                                                         [A{'loss': 0.0126, 'grad_norm': 0.004971448332071304, 'learning_rate': 1e-05, 'num_tokens': 335631243.0, 'completions/mean_length': 6841.0625, 'completions/min_length': 689.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6453.13818359375, 'completions/min_terminated_length': 689.0, 'completions/max_terminated_length': 16251.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2596156895160675, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020256079733371735, 'sampling/sampling_logp_difference/max': 11.547955513000488, 'sampling/importance_sampling_ratio/min': 9.655764188210014e-06, 'sampling/importance_sampling_ratio/mean': 0.999934196472168, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8979457840323448, 'clip_ratio/low_mean': 4.519663821156428e-05, 'clip_ratio/low_min': 2.775434040813707e-06, 'clip_ratio/high_mean': 9.53844971718354e-06, 'clip_ratio/high_max': 3.815379886873416e-05, 'clip_ratio/region_mean': 5.473508826980833e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 383/1024 [17:29:48<31:01:15, 174.22s/it][AINFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 384/1024 [17:32:49<31:22:32, 176.49s/it][A
+                                                         [A{'loss': 0.0262, 'grad_norm': 0.0038604787550866604, 'learning_rate': 1e-05, 'num_tokens': 336537162.0, 'completions/mean_length': 6919.8046875, 'completions/min_length': 896.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6454.35205078125, 'completions/min_terminated_length': 896.0, 'completions/max_terminated_length': 15060.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02030865103006363, 'sampling/sampling_logp_difference/max': 6.999982833862305, 'sampling/importance_sampling_ratio/min': 0.0009118975722230971, 'sampling/importance_sampling_ratio/mean': 0.9998080730438232, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9241961911320686, 'clip_ratio/low_mean': 3.1563491688757495e-05, 'clip_ratio/low_min': 3.1228139505401487e-06, 'clip_ratio/high_mean': 1.0405914281363948e-06, 'clip_ratio/high_max': 4.162365712545579e-06, 'clip_ratio/region_mean': 3.260408311689389e-05, 'epoch': 0.35}
+
+ 38%|███▊      | 384/1024 [17:32:49<31:22:32, 176.49s/it][AINFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 385/1024 [17:36:02<32:11:34, 181.37s/it][A
+                                                         [A{'loss': 0.0849, 'grad_norm': 0.004624314606189728, 'learning_rate': 1e-05, 'num_tokens': 337542492.0, 'completions/mean_length': 7679.390625, 'completions/min_length': 105.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7099.08349609375, 'completions/min_terminated_length': 105.0, 'completions/max_terminated_length': 15692.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2517249882221222, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02206476218998432, 'sampling/sampling_logp_difference/max': 9.748971939086914, 'sampling/importance_sampling_ratio/min': 5.83546279813163e-05, 'sampling/importance_sampling_ratio/mean': 0.9999251961708069, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0165777206420898, 'clip_ratio/low_mean': 4.3847362121596234e-05, 'clip_ratio/low_min': 6.294533704931382e-06, 'clip_ratio/high_mean': 1.6295562090817839e-06, 'clip_ratio/high_max': 6.5182248363271356e-06, 'clip_ratio/region_mean': 4.547691833067802e-05, 'epoch': 0.35}
+
+ 38%|███▊      | 385/1024 [17:36:02<32:11:34, 181.37s/it][AINFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 386/1024 [17:38:35<30:36:12, 172.68s/it][A
+                                                         [A{'loss': 0.0789, 'grad_norm': 0.0021966886706650257, 'learning_rate': 1e-05, 'num_tokens': 338324279.0, 'completions/mean_length': 5957.5859375, 'completions/min_length': 1705.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5792.08740234375, 'completions/min_terminated_length': 1705.0, 'completions/max_terminated_length': 15819.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01804077997803688, 'sampling/sampling_logp_difference/max': 7.125762462615967, 'sampling/importance_sampling_ratio/min': 0.0008041196851991117, 'sampling/importance_sampling_ratio/mean': 0.999998927116394, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7705951780080795, 'clip_ratio/low_mean': 3.392923713363416e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5012490166554926e-06, 'clip_ratio/high_max': 6.00499606662197e-06, 'clip_ratio/region_mean': 3.543048615028965e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 386/1024 [17:38:35<30:36:12, 172.68s/it][AINFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 387/1024 [17:41:11<29:41:39, 167.82s/it][A
+                                                         [A{'loss': 0.134, 'grad_norm': 0.001694107661023736, 'learning_rate': 1e-05, 'num_tokens': 339274662.0, 'completions/mean_length': 7269.8046875, 'completions/min_length': 892.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7198.03955078125, 'completions/min_terminated_length': 892.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.30487072467803955, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021742526441812515, 'sampling/sampling_logp_difference/max': 6.4581451416015625, 'sampling/importance_sampling_ratio/min': 0.0015677008777856827, 'sampling/importance_sampling_ratio/mean': 0.9999039769172668, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0025205165147781, 'clip_ratio/low_mean': 5.276240381135722e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.927837553874269e-06, 'clip_ratio/high_max': 1.5711350215497077e-05, 'clip_ratio/region_mean': 5.669024130838807e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 387/1024 [17:41:11<29:41:39, 167.82s/it][AINFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 388/1024 [17:44:10<30:15:10, 171.24s/it][A
+                                                         [A{'loss': 0.0691, 'grad_norm': 0.004587972536683083, 'learning_rate': 1e-05, 'num_tokens': 340272689.0, 'completions/mean_length': 7643.8359375, 'completions/min_length': 1061.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7288.54443359375, 'completions/min_terminated_length': 1061.0, 'completions/max_terminated_length': 15755.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.35324612259864807, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01862112432718277, 'sampling/sampling_logp_difference/max': 7.210168361663818, 'sampling/importance_sampling_ratio/min': 0.0007390327518805861, 'sampling/importance_sampling_ratio/mean': 0.9999613761901855, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7936615869402885, 'clip_ratio/low_mean': 5.100632029098051e-05, 'clip_ratio/low_min': 8.934973720897688e-06, 'clip_ratio/high_mean': 1.7514622072667407e-06, 'clip_ratio/high_max': 7.005848829066963e-06, 'clip_ratio/region_mean': 5.275778244140383e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 388/1024 [17:44:10<30:15:10, 171.24s/it][AINFO 12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 389/1024 [17:46:58<30:02:05, 170.28s/it][A
+                                                         [A{'loss': 0.0636, 'grad_norm': 0.00245783943682909, 'learning_rate': 1e-05, 'num_tokens': 341195599.0, 'completions/mean_length': 7068.734375, 'completions/min_length': 775.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6610.60595703125, 'completions/min_terminated_length': 775.0, 'completions/max_terminated_length': 14401.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.21594557166099548, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019989900290966034, 'sampling/sampling_logp_difference/max': 11.090067863464355, 'sampling/importance_sampling_ratio/min': 1.526316918898374e-05, 'sampling/importance_sampling_ratio/mean': 0.999957263469696, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8858344480395317, 'clip_ratio/low_mean': 2.139122614153166e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6306840936740628e-06, 'clip_ratio/high_max': 1.0522736374696251e-05, 'clip_ratio/region_mean': 2.4021910121518886e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 389/1024 [17:46:58<30:02:05, 170.28s/it][AINFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 390/1024 [17:49:42<29:38:40, 168.33s/it][A
+                                                         [A{'loss': 0.0181, 'grad_norm': 0.0067657483741641045, 'learning_rate': 1e-05, 'num_tokens': 341993565.0, 'completions/mean_length': 6093.296875, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5929.95263671875, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 15788.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.12415502220392227, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02037961222231388, 'sampling/sampling_logp_difference/max': 4.56026554107666, 'sampling/importance_sampling_ratio/min': 0.010459281504154205, 'sampling/importance_sampling_ratio/mean': 0.9998992681503296, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9640207663178444, 'clip_ratio/low_mean': 2.2939096254503966e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.318064846600464e-06, 'clip_ratio/high_max': 5.272259386401856e-06, 'clip_ratio/region_mean': 2.4257160987417592e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 390/1024 [17:49:42<29:38:40, 168.33s/it][AINFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 391/1024 [17:52:24<29:17:06, 166.55s/it][A
+                                                         [A{'loss': 0.0306, 'grad_norm': 0.0018817185191437602, 'learning_rate': 1e-05, 'num_tokens': 342990545.0, 'completions/mean_length': 7620.09375, 'completions/min_length': 1076.0, 'completions/max_length': 16170.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7620.09375, 'completions/min_terminated_length': 1076.0, 'completions/max_terminated_length': 16170.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.18755048513412476, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021528441458940506, 'sampling/sampling_logp_difference/max': 7.281149864196777, 'sampling/importance_sampling_ratio/min': 0.0006883936002850533, 'sampling/importance_sampling_ratio/mean': 0.9999568462371826, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9773544892668724, 'clip_ratio/low_mean': 4.566248594528588e-05, 'clip_ratio/low_min': 4.402028480399167e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.566248594528588e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 391/1024 [17:52:24<29:17:06, 166.55s/it][AINFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 392/1024 [17:55:13<29:21:04, 167.19s/it][A
+                                                         [A{'loss': 0.087, 'grad_norm': 0.0052104732021689415, 'learning_rate': 1e-05, 'num_tokens': 343898791.0, 'completions/mean_length': 6963.984375, 'completions/min_length': 646.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6737.904296875, 'completions/min_terminated_length': 646.0, 'completions/max_terminated_length': 15053.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3621976971626282, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021434593945741653, 'sampling/sampling_logp_difference/max': 4.526732921600342, 'sampling/importance_sampling_ratio/min': 0.010815954767167568, 'sampling/importance_sampling_ratio/mean': 0.9999324679374695, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9683744385838509, 'clip_ratio/low_mean': 7.762144696243922e-05, 'clip_ratio/low_min': 2.4772080450929934e-05, 'clip_ratio/high_mean': 7.985045499481203e-06, 'clip_ratio/high_max': 2.6727505428425502e-05, 'clip_ratio/region_mean': 8.560649303035461e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 392/1024 [17:55:13<29:21:04, 167.19s/it][AINFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 393/1024 [17:57:50<28:44:55, 164.02s/it][A
+                                                         [A{'loss': 0.0085, 'grad_norm': 0.005151392426341772, 'learning_rate': 1e-05, 'num_tokens': 344779672.0, 'completions/mean_length': 6718.5078125, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6642.4013671875, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 15116.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0201373603194952, 'sampling/sampling_logp_difference/max': 6.025149822235107, 'sampling/importance_sampling_ratio/min': 0.0024171893019229174, 'sampling/importance_sampling_ratio/mean': 0.999840497970581, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9043834507465363, 'clip_ratio/low_mean': 2.5377692509209737e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.365133804640209e-06, 'clip_ratio/high_max': 1.3545108686230378e-05, 'clip_ratio/region_mean': 2.9742826200163108e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 393/1024 [17:57:50<28:44:55, 164.02s/it][AINFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 394/1024 [18:00:30<28:29:30, 162.81s/it][A
+                                                         [A{'loss': 0.0554, 'grad_norm': 0.0026606651954352856, 'learning_rate': 1e-05, 'num_tokens': 345701722.0, 'completions/mean_length': 7044.640625, 'completions/min_length': 411.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6820.49609375, 'completions/min_terminated_length': 411.0, 'completions/max_terminated_length': 16342.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.24146249890327454, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01981864869594574, 'sampling/sampling_logp_difference/max': 10.157968521118164, 'sampling/importance_sampling_ratio/min': 3.8765938370488584e-05, 'sampling/importance_sampling_ratio/mean': 1.0000128746032715, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9017335474491119, 'clip_ratio/low_mean': 2.739263118201052e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.927679188109323e-06, 'clip_ratio/high_max': 1.2263486723895767e-05, 'clip_ratio/region_mean': 3.132031042696326e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 394/1024 [18:00:30<28:29:30, 162.81s/it][AINFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▊      | 395/1024 [18:02:55<27:32:05, 157.59s/it][A
+                                                         [A{'loss': 0.0947, 'grad_norm': 0.003957017324864864, 'learning_rate': 1e-05, 'num_tokens': 346492810.0, 'completions/mean_length': 6031.875, 'completions/min_length': 520.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5950.3623046875, 'completions/min_terminated_length': 520.0, 'completions/max_terminated_length': 15476.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2858940362930298, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018711457028985023, 'sampling/sampling_logp_difference/max': 6.493460178375244, 'sampling/importance_sampling_ratio/min': 0.0015133036067709327, 'sampling/importance_sampling_ratio/mean': 0.9999707341194153, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8537683561444283, 'clip_ratio/low_mean': 4.819571529424138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.566390890024195e-06, 'clip_ratio/high_max': 1.026556356009678e-05, 'clip_ratio/region_mean': 5.0762106297952414e-05, 'epoch': 0.36}
+
+ 39%|███▊      | 395/1024 [18:02:55<27:32:05, 157.59s/it][AINFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▊      | 396/1024 [18:06:06<29:14:32, 167.63s/it][A
+                                                         [A{'loss': 0.1257, 'grad_norm': 0.002122648525983095, 'learning_rate': 1e-05, 'num_tokens': 347462871.0, 'completions/mean_length': 7429.3515625, 'completions/min_length': 1194.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6911.31396484375, 'completions/min_terminated_length': 1194.0, 'completions/max_terminated_length': 15942.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01998838409781456, 'sampling/sampling_logp_difference/max': 8.873497009277344, 'sampling/importance_sampling_ratio/min': 0.00014005196862854064, 'sampling/importance_sampling_ratio/mean': 1.0000076293945312, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8821266070008278, 'clip_ratio/low_mean': 3.637038832948747e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4676222122034233e-06, 'clip_ratio/high_max': 5.870488848813693e-06, 'clip_ratio/region_mean': 3.783801014378696e-05, 'epoch': 0.36}
+
+ 39%|███▊      | 396/1024 [18:06:06<29:14:32, 167.63s/it][AINFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 397/1024 [18:09:09<29:58:12, 172.08s/it][A
+                                                         [A{'loss': 0.0676, 'grad_norm': 0.002546454081311822, 'learning_rate': 1e-05, 'num_tokens': 348395842.0, 'completions/mean_length': 7131.7109375, 'completions/min_length': 821.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6833.25, 'completions/min_terminated_length': 821.0, 'completions/max_terminated_length': 15761.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0193922221660614, 'sampling/sampling_logp_difference/max': 8.436627388000488, 'sampling/importance_sampling_ratio/min': 0.0002167800412280485, 'sampling/importance_sampling_ratio/mean': 0.999964714050293, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8575824722647667, 'clip_ratio/low_mean': 6.443337406381033e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6659830609787605e-06, 'clip_ratio/high_max': 1.0663932243915042e-05, 'clip_ratio/region_mean': 6.709935701110226e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 397/1024 [18:09:09<29:58:12, 172.08s/it][AINFO 12-02 07:34:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:34:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:34:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:34:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 398/1024 [18:11:52<29:26:57, 169.36s/it][A
+                                                         [A{'loss': 0.0818, 'grad_norm': 0.00492837093770504, 'learning_rate': 1e-05, 'num_tokens': 349292790.0, 'completions/mean_length': 6858.34375, 'completions/min_length': 772.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6707.14306640625, 'completions/min_terminated_length': 772.0, 'completions/max_terminated_length': 16200.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.1949220597743988, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020318543538451195, 'sampling/sampling_logp_difference/max': 6.79857063293457, 'sampling/importance_sampling_ratio/min': 0.0011153683299198747, 'sampling/importance_sampling_ratio/mean': 0.9998850226402283, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9539813920855522, 'clip_ratio/low_mean': 3.932982110654848e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.618344165573944e-07, 'clip_ratio/high_max': 3.847337666229578e-06, 'clip_ratio/region_mean': 4.029165563679271e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 398/1024 [18:11:52<29:26:57, 169.36s/it][AINFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 07:38:48,166 - math_verify.grader - WARNING - Timeout during comparison
+
+ 39%|███▉      | 399/1024 [18:14:51<29:56:15, 172.44s/it][A
+                                                         [A{'loss': 0.0273, 'grad_norm': 0.004895905964076519, 'learning_rate': 1e-05, 'num_tokens': 350312556.0, 'completions/mean_length': 7809.984375, 'completions/min_length': 1002.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7533.40283203125, 'completions/min_terminated_length': 1002.0, 'completions/max_terminated_length': 15261.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.22567616403102875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018754083663225174, 'sampling/sampling_logp_difference/max': 7.0799760818481445, 'sampling/importance_sampling_ratio/min': 0.0008417933131568134, 'sampling/importance_sampling_ratio/mean': 0.9999260306358337, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8353303670883179, 'clip_ratio/low_mean': 3.8245348378040944e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.22843152389396e-06, 'clip_ratio/high_max': 1.291372609557584e-05, 'clip_ratio/region_mean': 4.1473780811429606e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 399/1024 [18:14:51<29:56:15, 172.44s/it][AINFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 400/1024 [18:17:33<29:19:50, 169.21s/it][A
+                                                         [A{'loss': 0.0402, 'grad_norm': 0.0032397822942584753, 'learning_rate': 1e-05, 'num_tokens': 351252755.0, 'completions/mean_length': 7194.9296875, 'completions/min_length': 233.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6821.39013671875, 'completions/min_terminated_length': 233.0, 'completions/max_terminated_length': 15057.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.19438527524471283, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02105094864964485, 'sampling/sampling_logp_difference/max': 8.370504379272461, 'sampling/importance_sampling_ratio/min': 0.00023159870761446655, 'sampling/importance_sampling_ratio/mean': 0.9998766183853149, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9744522422552109, 'clip_ratio/low_mean': 3.196108968950284e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5690324011738994e-06, 'clip_ratio/high_max': 1.1250081115576904e-05, 'clip_ratio/region_mean': 3.553012152224255e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 400/1024 [18:17:33<29:19:50, 169.21s/it][AINFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 401/1024 [18:20:13<28:48:34, 166.48s/it][A
+                                                         [A{'loss': 0.0424, 'grad_norm': 0.0031576494220644236, 'learning_rate': 1e-05, 'num_tokens': 352145873.0, 'completions/mean_length': 6836.234375, 'completions/min_length': 379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6607.08837890625, 'completions/min_terminated_length': 379.0, 'completions/max_terminated_length': 15745.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020445333793759346, 'sampling/sampling_logp_difference/max': 6.727474689483643, 'sampling/importance_sampling_ratio/min': 0.0011975533561781049, 'sampling/importance_sampling_ratio/mean': 0.9999266862869263, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9149863049387932, 'clip_ratio/low_mean': 2.2670621888210007e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7451138774049468e-06, 'clip_ratio/high_max': 6.980455509619787e-06, 'clip_ratio/region_mean': 2.441573599298863e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 401/1024 [18:20:13<28:48:34, 166.48s/it][AINFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 402/1024 [18:23:03<28:57:20, 167.59s/it][A
+                                                         [A{'loss': 0.051, 'grad_norm': 0.003970830701291561, 'learning_rate': 1e-05, 'num_tokens': 353056405.0, 'completions/mean_length': 6942.65625, 'completions/min_length': 175.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6638.0966796875, 'completions/min_terminated_length': 175.0, 'completions/max_terminated_length': 16380.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3282659649848938, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018101349472999573, 'sampling/sampling_logp_difference/max': 11.687329292297363, 'sampling/importance_sampling_ratio/min': 8.399576472584158e-06, 'sampling/importance_sampling_ratio/mean': 1.0000462532043457, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7541583999991417, 'clip_ratio/low_mean': 5.359988131203863e-05, 'clip_ratio/low_min': 1.3856095392839052e-05, 'clip_ratio/high_mean': 5.889334147468617e-06, 'clip_ratio/high_max': 2.3557336589874467e-05, 'clip_ratio/region_mean': 5.9489215118446737e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 402/1024 [18:23:03<28:57:20, 167.59s/it][AINFO 12-02 07:48:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:48:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:48:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:48:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 403/1024 [18:25:37<28:10:05, 163.29s/it][A
+                                                         [A{'loss': 0.029, 'grad_norm': 0.0043656788766384125, 'learning_rate': 1e-05, 'num_tokens': 353844661.0, 'completions/mean_length': 6022.1875, 'completions/min_length': 1285.0, 'completions/max_length': 14786.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6022.1875, 'completions/min_terminated_length': 1285.0, 'completions/max_terminated_length': 14786.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.22225631773471832, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020655371248722076, 'sampling/sampling_logp_difference/max': 2.9993722438812256, 'sampling/importance_sampling_ratio/min': 0.04981832951307297, 'sampling/importance_sampling_ratio/mean': 0.9999772310256958, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9535745903849602, 'clip_ratio/low_mean': 1.968103515537223e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.517377525800839e-06, 'clip_ratio/high_max': 2.6139805413549766e-05, 'clip_ratio/region_mean': 2.7198412681173068e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 403/1024 [18:25:37<28:10:05, 163.29s/it][AINFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 404/1024 [18:28:31<28:42:05, 166.65s/it][A
+                                                         [A{'loss': 0.006, 'grad_norm': 0.006543307099491358, 'learning_rate': 1e-05, 'num_tokens': 354894689.0, 'completions/mean_length': 8068.96875, 'completions/min_length': 468.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7869.408203125, 'completions/min_terminated_length': 468.0, 'completions/max_terminated_length': 15906.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.24988999962806702, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021852033212780952, 'sampling/sampling_logp_difference/max': 9.614944458007812, 'sampling/importance_sampling_ratio/min': 6.672408926533535e-05, 'sampling/importance_sampling_ratio/mean': 0.9999514818191528, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9473539590835571, 'clip_ratio/low_mean': 5.21388310517068e-05, 'clip_ratio/low_min': 2.633131089169183e-06, 'clip_ratio/high_mean': 2.9508817647183605e-06, 'clip_ratio/high_max': 9.152076700047473e-06, 'clip_ratio/region_mean': 5.508971298695542e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 404/1024 [18:28:31<28:42:05, 166.65s/it][AINFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 405/1024 [18:31:09<28:11:11, 163.93s/it][A
+                                                         [A{'loss': 0.0293, 'grad_norm': 0.003351036459207535, 'learning_rate': 1e-05, 'num_tokens': 355677273.0, 'completions/mean_length': 5960.1875, 'completions/min_length': 491.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5878.1103515625, 'completions/min_terminated_length': 491.0, 'completions/max_terminated_length': 15748.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.31642353534698486, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021779976785182953, 'sampling/sampling_logp_difference/max': 6.656237602233887, 'sampling/importance_sampling_ratio/min': 0.0012859756825491786, 'sampling/importance_sampling_ratio/mean': 0.9999220371246338, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9564141109585762, 'clip_ratio/low_mean': 5.5152235972855124e-05, 'clip_ratio/low_min': 1.0455875781190116e-05, 'clip_ratio/high_mean': 7.4048172109542065e-06, 'clip_ratio/high_max': 2.9619268843816826e-05, 'clip_ratio/region_mean': 6.255705375224352e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 405/1024 [18:31:09<28:11:11, 163.93s/it][AINFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 406/1024 [18:34:06<28:49:01, 167.87s/it][A
+                                                         [A{'loss': 0.039, 'grad_norm': 0.0031219006050378084, 'learning_rate': 1e-05, 'num_tokens': 356675829.0, 'completions/mean_length': 7620.21875, 'completions/min_length': 328.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7189.212890625, 'completions/min_terminated_length': 328.0, 'completions/max_terminated_length': 15669.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.1751839816570282, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021951109170913696, 'sampling/sampling_logp_difference/max': 4.591080188751221, 'sampling/importance_sampling_ratio/min': 0.010141897015273571, 'sampling/importance_sampling_ratio/mean': 1.0001060962677002, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.035948596894741, 'clip_ratio/low_mean': 3.758041248147492e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.989421491543908e-06, 'clip_ratio/high_max': 7.957685966175632e-06, 'clip_ratio/region_mean': 3.956983414354909e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 406/1024 [18:34:06<28:49:01, 167.87s/it][AINFO 12-02 07:59:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:59:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:59:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:59:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 407/1024 [18:36:39<28:00:30, 163.42s/it][A
+                                                         [A{'loss': 0.0471, 'grad_norm': 0.002810312667861581, 'learning_rate': 1e-05, 'num_tokens': 357438712.0, 'completions/mean_length': 5806.0234375, 'completions/min_length': 1319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5638.119140625, 'completions/min_terminated_length': 1319.0, 'completions/max_terminated_length': 14038.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.22832970321178436, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01965375244617462, 'sampling/sampling_logp_difference/max': 6.747459888458252, 'sampling/importance_sampling_ratio/min': 0.0011738575994968414, 'sampling/importance_sampling_ratio/mean': 0.9999280571937561, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8977029845118523, 'clip_ratio/low_mean': 3.914574369900947e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.7169204978890775e-06, 'clip_ratio/high_max': 2.286768199155631e-05, 'clip_ratio/region_mean': 4.486266482217616e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 407/1024 [18:36:39<28:00:30, 163.42s/it][AINFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 408/1024 [18:39:04<27:01:59, 157.99s/it][A
+                                                         [A{'loss': 0.0517, 'grad_norm': 0.004516562446951866, 'learning_rate': 1e-05, 'num_tokens': 358296731.0, 'completions/mean_length': 6537.4609375, 'completions/min_length': 842.0, 'completions/max_length': 15705.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6537.4609375, 'completions/min_terminated_length': 842.0, 'completions/max_terminated_length': 15705.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.1830746978521347, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021242395043373108, 'sampling/sampling_logp_difference/max': 12.946335792541504, 'sampling/importance_sampling_ratio/min': 2.384942035860149e-06, 'sampling/importance_sampling_ratio/mean': 0.9999170303344727, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9577726796269417, 'clip_ratio/low_mean': 3.186109779562685e-05, 'clip_ratio/low_min': 4.3511558942554984e-06, 'clip_ratio/high_mean': 3.054844910366228e-06, 'clip_ratio/high_max': 1.2219379641464911e-05, 'clip_ratio/region_mean': 3.4915943160740426e-05, 'epoch': 0.38}
+
+ 40%|███▉      | 408/1024 [18:39:04<27:01:59, 157.99s/it][AINFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|███▉      | 409/1024 [18:41:51<27:27:01, 160.69s/it][A
+                                                         [A{'loss': 0.05, 'grad_norm': 0.003542230697348714, 'learning_rate': 1e-05, 'num_tokens': 359327001.0, 'completions/mean_length': 7896.671875, 'completions/min_length': 1047.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7622.88671875, 'completions/min_terminated_length': 1047.0, 'completions/max_terminated_length': 16360.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.23645778000354767, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020085681229829788, 'sampling/sampling_logp_difference/max': 9.124931335449219, 'sampling/importance_sampling_ratio/min': 0.00010891625424847007, 'sampling/importance_sampling_ratio/mean': 0.9998560547828674, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9163230583071709, 'clip_ratio/low_mean': 3.026239573955536e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6056723047295236e-06, 'clip_ratio/high_max': 1.4422689218918094e-05, 'clip_ratio/region_mean': 3.3868068385345396e-05, 'epoch': 0.38}
+
+ 40%|███▉      | 409/1024 [18:41:51<27:27:01, 160.69s/it][AINFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 410/1024 [18:44:21<26:52:06, 157.53s/it][A
+                                                         [A{'loss': 0.0518, 'grad_norm': 0.0035069347359240055, 'learning_rate': 1e-05, 'num_tokens': 360208780.0, 'completions/mean_length': 6728.7109375, 'completions/min_length': 454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6652.68505859375, 'completions/min_terminated_length': 454.0, 'completions/max_terminated_length': 15297.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021022530272603035, 'sampling/sampling_logp_difference/max': 11.124998092651367, 'sampling/importance_sampling_ratio/min': 1.4739226571691688e-05, 'sampling/importance_sampling_ratio/mean': 0.9999571442604065, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9010183215141296, 'clip_ratio/low_mean': 4.2465159026505717e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.474494003010477e-06, 'clip_ratio/high_max': 1.7827243254942005e-05, 'clip_ratio/region_mean': 4.793965263161226e-05, 'epoch': 0.38}
+
+ 40%|████      | 410/1024 [18:44:21<26:52:06, 157.53s/it][AINFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 411/1024 [18:47:25<28:10:49, 165.50s/it][A
+                                                         [A{'loss': 0.0221, 'grad_norm': 0.0033910400234162807, 'learning_rate': 1e-05, 'num_tokens': 361098567.0, 'completions/mean_length': 6800.3984375, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6491.25, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 16167.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2306838035583496, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019660964608192444, 'sampling/sampling_logp_difference/max': 6.536596298217773, 'sampling/importance_sampling_ratio/min': 0.001449413481168449, 'sampling/importance_sampling_ratio/mean': 0.9998576641082764, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8654960840940475, 'clip_ratio/low_mean': 2.8587513156708155e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.594247348497447e-06, 'clip_ratio/high_max': 1.0376989393989788e-05, 'clip_ratio/region_mean': 3.1181759936771414e-05, 'epoch': 0.38}
+
+ 40%|████      | 411/1024 [18:47:25<28:10:49, 165.50s/it][AINFO 12-02 08:12:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 412/1024 [18:50:04<27:48:41, 163.60s/it][A
+                                                         [A{'loss': 0.0834, 'grad_norm': 0.0036110079381614923, 'learning_rate': 1e-05, 'num_tokens': 362027520.0, 'completions/mean_length': 7103.4453125, 'completions/min_length': 1711.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6956.13525390625, 'completions/min_terminated_length': 1711.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.33797892928123474, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01939362846314907, 'sampling/sampling_logp_difference/max': 11.458046913146973, 'sampling/importance_sampling_ratio/min': 1.0564122931100428e-05, 'sampling/importance_sampling_ratio/mean': 0.9999338984489441, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8317076042294502, 'clip_ratio/low_mean': 5.8515578757578623e-05, 'clip_ratio/low_min': 1.0348648629587842e-05, 'clip_ratio/high_mean': 7.792090059410839e-06, 'clip_ratio/high_max': 2.3068858354236e-05, 'clip_ratio/region_mean': 6.630766870330262e-05, 'epoch': 0.38}
+
+ 40%|████      | 412/1024 [18:50:04<27:48:41, 163.60s/it][AINFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 413/1024 [18:52:56<28:10:21, 165.99s/it][A
+                                                         [A{'loss': 0.0756, 'grad_norm': 0.002141098491847515, 'learning_rate': 1e-05, 'num_tokens': 362985207.0, 'completions/mean_length': 7344.9296875, 'completions/min_length': 1368.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6900.384765625, 'completions/min_terminated_length': 1368.0, 'completions/max_terminated_length': 15830.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01929464004933834, 'sampling/sampling_logp_difference/max': 10.874617576599121, 'sampling/importance_sampling_ratio/min': 1.8932745661004446e-05, 'sampling/importance_sampling_ratio/mean': 0.9999322891235352, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8387318029999733, 'clip_ratio/low_mean': 5.127149995587388e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.780986948091595e-07, 'clip_ratio/high_max': 3.112394779236638e-06, 'clip_ratio/region_mean': 5.204959859383962e-05, 'epoch': 0.38}
+
+ 40%|████      | 413/1024 [18:52:56<28:10:21, 165.99s/it][AINFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 40%|████      | 414/1024 [18:55:51<28:36:34, 168.84s/it][A
+                                                         [A{'loss': 0.0608, 'grad_norm': 0.0015244127716869116, 'learning_rate': 1e-05, 'num_tokens': 363823914.0, 'completions/mean_length': 6377.7734375, 'completions/min_length': 839.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6218.94482421875, 'completions/min_terminated_length': 839.0, 'completions/max_terminated_length': 16137.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.1988610327243805, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020688029006123543, 'sampling/sampling_logp_difference/max': 5.061592102050781, 'sampling/importance_sampling_ratio/min': 0.006335465237498283, 'sampling/importance_sampling_ratio/mean': 0.9999363422393799, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9732858911156654, 'clip_ratio/low_mean': 1.7854434247510653e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3150696531738504e-06, 'clip_ratio/high_max': 5.2602786126954015e-06, 'clip_ratio/region_mean': 1.9169503786997666e-05, 'epoch': 0.38}
+
+ 40%|████      | 414/1024 [18:55:51<28:36:34, 168.84s/it][AINFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 415/1024 [18:58:31<28:05:27, 166.05s/it][A
+                                                         [A{'loss': 0.0311, 'grad_norm': 0.002647512126713991, 'learning_rate': 1e-05, 'num_tokens': 364561127.0, 'completions/mean_length': 5599.7890625, 'completions/min_length': 422.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5340.96826171875, 'completions/min_terminated_length': 422.0, 'completions/max_terminated_length': 14456.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01878243312239647, 'sampling/sampling_logp_difference/max': 12.952398300170898, 'sampling/importance_sampling_ratio/min': 2.370526999584399e-06, 'sampling/importance_sampling_ratio/mean': 0.9999077916145325, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8872368410229683, 'clip_ratio/low_mean': 3.3802934012783226e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.548875148837396e-06, 'clip_ratio/high_max': 2.6195500595349586e-05, 'clip_ratio/region_mean': 4.035180882056011e-05, 'epoch': 0.38}
+
+ 41%|████      | 415/1024 [18:58:31<28:05:27, 166.05s/it][AINFO 12-02 08:23:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:23:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:23:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:23:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 416/1024 [19:01:26<28:30:45, 168.83s/it][A
+                                                         [A{'loss': 0.0541, 'grad_norm': 0.0018051012884825468, 'learning_rate': 1e-05, 'num_tokens': 365590124.0, 'completions/mean_length': 7877.2890625, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7385.1650390625, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 15905.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.28407180309295654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019809434190392494, 'sampling/sampling_logp_difference/max': 7.800533294677734, 'sampling/importance_sampling_ratio/min': 0.0004095165350008756, 'sampling/importance_sampling_ratio/mean': 0.9999774694442749, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8416353687644005, 'clip_ratio/low_mean': 7.215861739950924e-05, 'clip_ratio/low_min': 1.4898997051204788e-05, 'clip_ratio/high_mean': 5.3931973980070325e-06, 'clip_ratio/high_max': 2.157278959202813e-05, 'clip_ratio/region_mean': 7.755181559332414e-05, 'epoch': 0.38}
+
+ 41%|████      | 416/1024 [19:01:26<28:30:45, 168.83s/it][AINFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 417/1024 [19:04:11<28:15:50, 167.63s/it][A
+                                                         [A{'loss': 0.0146, 'grad_norm': 0.004550795070827007, 'learning_rate': 1e-05, 'num_tokens': 366486337.0, 'completions/mean_length': 6836.7890625, 'completions/min_length': 909.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6200.30859375, 'completions/min_terminated_length': 909.0, 'completions/max_terminated_length': 16083.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.22620806097984314, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01992485672235489, 'sampling/sampling_logp_difference/max': 9.124993324279785, 'sampling/importance_sampling_ratio/min': 0.0001089095021598041, 'sampling/importance_sampling_ratio/mean': 0.9999873638153076, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8647575601935387, 'clip_ratio/low_mean': 4.230594890941575e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.352486593641515e-06, 'clip_ratio/high_max': 2.540994637456606e-05, 'clip_ratio/region_mean': 4.8658435844117776e-05, 'epoch': 0.38}
+
+ 41%|████      | 417/1024 [19:04:11<28:15:50, 167.63s/it][AINFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 418/1024 [19:06:47<27:37:23, 164.10s/it][A
+                                                         [A{'loss': 0.1054, 'grad_norm': 0.005958946421742439, 'learning_rate': 1e-05, 'num_tokens': 367386163.0, 'completions/mean_length': 6884.953125, 'completions/min_length': 1289.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6417.78662109375, 'completions/min_terminated_length': 1289.0, 'completions/max_terminated_length': 16286.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019436441361904144, 'sampling/sampling_logp_difference/max': 11.562139511108398, 'sampling/importance_sampling_ratio/min': 9.519772902422119e-06, 'sampling/importance_sampling_ratio/mean': 1.0000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8691708743572235, 'clip_ratio/low_mean': 3.5717548257707676e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8981661444049678e-06, 'clip_ratio/high_max': 1.1592664577619871e-05, 'clip_ratio/region_mean': 3.861571451579948e-05, 'epoch': 0.38}
+
+ 41%|████      | 418/1024 [19:06:47<27:37:23, 164.10s/it][AINFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 419/1024 [19:09:35<27:46:02, 165.23s/it][A
+                                                         [A{'loss': 0.1918, 'grad_norm': 0.00558120384812355, 'learning_rate': 1e-05, 'num_tokens': 368357500.0, 'completions/mean_length': 7439.1328125, 'completions/min_length': 938.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7150.58837890625, 'completions/min_terminated_length': 938.0, 'completions/max_terminated_length': 15574.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.3795146346092224, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018824251368641853, 'sampling/sampling_logp_difference/max': 9.062491416931152, 'sampling/importance_sampling_ratio/min': 0.0001159337698481977, 'sampling/importance_sampling_ratio/mean': 0.9999570250511169, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.795464999973774, 'clip_ratio/low_mean': 3.938097847822064e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.455849524580117e-06, 'clip_ratio/high_max': 2.7658640192385064e-05, 'clip_ratio/region_mean': 4.7836828116487595e-05, 'epoch': 0.39}
+
+ 41%|████      | 419/1024 [19:09:35<27:46:02, 165.23s/it][AINFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 420/1024 [19:12:18<27:35:39, 164.47s/it][A
+                                                         [A{'loss': 0.0859, 'grad_norm': 0.004628168884664774, 'learning_rate': 1e-05, 'num_tokens': 369242920.0, 'completions/mean_length': 6751.53125, 'completions/min_length': 715.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6520.3525390625, 'completions/min_terminated_length': 715.0, 'completions/max_terminated_length': 16236.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019376013427972794, 'sampling/sampling_logp_difference/max': 7.406209468841553, 'sampling/importance_sampling_ratio/min': 0.0006074689445085824, 'sampling/importance_sampling_ratio/mean': 0.9999655485153198, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9450879693031311, 'clip_ratio/low_mean': 3.0958593640662e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1273888819450804e-06, 'clip_ratio/high_max': 8.509555527780321e-06, 'clip_ratio/region_mean': 3.308598269313734e-05, 'epoch': 0.39}
+
+ 41%|████      | 420/1024 [19:12:18<27:35:39, 164.47s/it][AINFO 12-02 08:37:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:37:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:37:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:37:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 421/1024 [19:15:33<29:05:45, 173.71s/it][A
+                                                         [A{'loss': 0.1066, 'grad_norm': 0.00389425759203732, 'learning_rate': 1e-05, 'num_tokens': 370159510.0, 'completions/mean_length': 7023.296875, 'completions/min_length': 1628.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6315.3447265625, 'completions/min_terminated_length': 1628.0, 'completions/max_terminated_length': 16164.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.323777437210083, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016914553940296173, 'sampling/sampling_logp_difference/max': 8.872963905334473, 'sampling/importance_sampling_ratio/min': 0.00014012664905749261, 'sampling/importance_sampling_ratio/mean': 0.9999127388000488, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7378111630678177, 'clip_ratio/low_mean': 4.86290555272717e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.572105126499082e-06, 'clip_ratio/high_max': 1.8288420505996328e-05, 'clip_ratio/region_mean': 5.320115997164976e-05, 'epoch': 0.39}
+
+ 41%|████      | 421/1024 [19:15:33<29:05:45, 173.71s/it][AINFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 422/1024 [19:18:14<28:25:39, 170.00s/it][A
+                                                         [A{'loss': 0.0149, 'grad_norm': 0.004324545152485371, 'learning_rate': 1e-05, 'num_tokens': 371162773.0, 'completions/mean_length': 7702.3046875, 'completions/min_length': 423.0, 'completions/max_length': 16018.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7702.3046875, 'completions/min_terminated_length': 423.0, 'completions/max_terminated_length': 16018.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.23250606656074524, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020495830103754997, 'sampling/sampling_logp_difference/max': 10.687313079833984, 'sampling/importance_sampling_ratio/min': 2.283278627146501e-05, 'sampling/importance_sampling_ratio/mean': 1.00001060962677, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9053447172045708, 'clip_ratio/low_mean': 2.3538930747690756e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.948400371380558e-06, 'clip_ratio/high_max': 2.1269573153404053e-05, 'clip_ratio/region_mean': 2.9487331687505502e-05, 'epoch': 0.39}
+
+ 41%|████      | 422/1024 [19:18:14<28:25:39, 170.00s/it][AINFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████▏     | 423/1024 [19:20:52<27:45:32, 166.28s/it][A
+                                                         [A{'loss': 0.0237, 'grad_norm': 0.003239463549107313, 'learning_rate': 1e-05, 'num_tokens': 372067241.0, 'completions/mean_length': 6904.78125, 'completions/min_length': 432.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6754.31787109375, 'completions/min_terminated_length': 432.0, 'completions/max_terminated_length': 15295.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.32719242572784424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019042208790779114, 'sampling/sampling_logp_difference/max': 8.999999046325684, 'sampling/importance_sampling_ratio/min': 0.00012340991816017777, 'sampling/importance_sampling_ratio/mean': 0.9999598264694214, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7991176024079323, 'clip_ratio/low_mean': 5.831611520079605e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5737292048688687e-06, 'clip_ratio/high_max': 1.0294916819475475e-05, 'clip_ratio/region_mean': 6.088984559937671e-05, 'epoch': 0.39}
+
+ 41%|████▏     | 423/1024 [19:20:52<27:45:32, 166.28s/it][AINFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████▏     | 424/1024 [19:23:34<27:30:52, 165.09s/it][A
+                                                         [A{'loss': 0.0587, 'grad_norm': 0.0015464330790564418, 'learning_rate': 1e-05, 'num_tokens': 372866072.0, 'completions/mean_length': 6107.7421875, 'completions/min_length': 89.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5602.35205078125, 'completions/min_terminated_length': 89.0, 'completions/max_terminated_length': 15399.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.1820138692855835, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019793221727013588, 'sampling/sampling_logp_difference/max': 8.306756019592285, 'sampling/importance_sampling_ratio/min': 0.00024684349773451686, 'sampling/importance_sampling_ratio/mean': 0.999971330165863, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9495253190398216, 'clip_ratio/low_mean': 1.552133551285806e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.926559305815317e-06, 'clip_ratio/high_max': 2.7261318791715894e-05, 'clip_ratio/region_mean': 2.3447895273420727e-05, 'epoch': 0.39}
+
+ 41%|████▏     | 424/1024 [19:23:34<27:30:52, 165.09s/it][AINFO 12-02 08:48:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:48:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:48:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:48:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 425/1024 [19:26:12<27:07:00, 162.97s/it][A
+                                                         [A{'loss': 0.1124, 'grad_norm': 0.0024811832699924707, 'learning_rate': 1e-05, 'num_tokens': 373663463.0, 'completions/mean_length': 6079.8046875, 'completions/min_length': 1082.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5747.4111328125, 'completions/min_terminated_length': 1082.0, 'completions/max_terminated_length': 15939.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.2630355656147003, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017151469364762306, 'sampling/sampling_logp_difference/max': 8.550286293029785, 'sampling/importance_sampling_ratio/min': 0.00019348970090504736, 'sampling/importance_sampling_ratio/mean': 0.9999743103981018, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8005363270640373, 'clip_ratio/low_mean': 3.261690835643094e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.533324717063806e-06, 'clip_ratio/high_max': 2.457227401464479e-05, 'clip_ratio/region_mean': 4.115023284612107e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 425/1024 [19:26:12<27:07:00, 162.97s/it][AINFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 426/1024 [19:28:51<26:52:48, 161.82s/it][A
+                                                         [A{'loss': 0.0959, 'grad_norm': 0.0031475063879042864, 'learning_rate': 1e-05, 'num_tokens': 374517492.0, 'completions/mean_length': 6453.7890625, 'completions/min_length': 347.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6375.5986328125, 'completions/min_terminated_length': 347.0, 'completions/max_terminated_length': 14925.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.19910329580307007, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019899867475032806, 'sampling/sampling_logp_difference/max': 4.156344890594482, 'sampling/importance_sampling_ratio/min': 0.015664709731936455, 'sampling/importance_sampling_ratio/mean': 0.9999594688415527, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9212624430656433, 'clip_ratio/low_mean': 2.132218082806503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.429997251369059e-07, 'clip_ratio/high_max': 3.3719989005476236e-06, 'clip_ratio/region_mean': 2.2165180553201935e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 426/1024 [19:28:51<26:52:48, 161.82s/it][AINFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 427/1024 [19:31:16<26:00:01, 156.79s/it][A
+                                                         [A{'loss': 0.0276, 'grad_norm': 0.004200868774205446, 'learning_rate': 1e-05, 'num_tokens': 375320339.0, 'completions/mean_length': 6126.9921875, 'completions/min_length': 1106.0, 'completions/max_length': 16159.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6126.9921875, 'completions/min_terminated_length': 1106.0, 'completions/max_terminated_length': 16159.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01833093911409378, 'sampling/sampling_logp_difference/max': 5.156249046325684, 'sampling/importance_sampling_ratio/min': 0.005763276945799589, 'sampling/importance_sampling_ratio/mean': 0.9999815225601196, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8252849578857422, 'clip_ratio/low_mean': 3.784128080042137e-05, 'clip_ratio/low_min': 3.7751804029539926e-06, 'clip_ratio/high_mean': 5.984868664654641e-06, 'clip_ratio/high_max': 1.907509408738406e-05, 'clip_ratio/region_mean': 4.382614952191943e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 427/1024 [19:31:16<26:00:01, 156.79s/it][AINFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 428/1024 [19:34:03<26:27:09, 159.78s/it][A
+                                                         [A{'loss': 0.0481, 'grad_norm': 0.003204014617949724, 'learning_rate': 1e-05, 'num_tokens': 376201015.0, 'completions/mean_length': 6739.09375, 'completions/min_length': 1228.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6427.9677734375, 'completions/min_terminated_length': 1228.0, 'completions/max_terminated_length': 15411.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.37086254358291626, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018961725756525993, 'sampling/sampling_logp_difference/max': 9.195985794067383, 'sampling/importance_sampling_ratio/min': 0.00010144581028725952, 'sampling/importance_sampling_ratio/mean': 0.9998303651809692, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8008574098348618, 'clip_ratio/low_mean': 6.169724406390742e-05, 'clip_ratio/low_min': 7.494657666029525e-06, 'clip_ratio/high_mean': 5.476571459439583e-06, 'clip_ratio/high_max': 1.8918785372079583e-05, 'clip_ratio/region_mean': 6.717381506859965e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 428/1024 [19:34:03<26:27:09, 159.78s/it][AINFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 429/1024 [19:36:58<27:10:19, 164.40s/it][A
+                                                         [A{'loss': 0.0299, 'grad_norm': 0.0039763906970620155, 'learning_rate': 1e-05, 'num_tokens': 377149650.0, 'completions/mean_length': 7245.8984375, 'completions/min_length': 1306.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6951.12060546875, 'completions/min_terminated_length': 1306.0, 'completions/max_terminated_length': 15634.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020948028191924095, 'sampling/sampling_logp_difference/max': 9.420292854309082, 'sampling/importance_sampling_ratio/min': 8.106228051474318e-05, 'sampling/importance_sampling_ratio/mean': 1.0000600814819336, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0351596996188164, 'clip_ratio/low_mean': 5.3925050679026754e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.389697269540193e-06, 'clip_ratio/high_max': 1.3558789078160771e-05, 'clip_ratio/region_mean': 5.731474743697618e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 429/1024 [19:36:58<27:10:19, 164.40s/it][AINFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 430/1024 [19:39:49<27:26:35, 166.32s/it][A
+                                                         [A{'loss': 0.0195, 'grad_norm': 0.0031417158897966146, 'learning_rate': 1e-05, 'num_tokens': 378057802.0, 'completions/mean_length': 6958.625, 'completions/min_length': 1047.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6495.08154296875, 'completions/min_terminated_length': 1047.0, 'completions/max_terminated_length': 15608.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.35771697759628296, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019185224547982216, 'sampling/sampling_logp_difference/max': 9.187026023864746, 'sampling/importance_sampling_ratio/min': 0.00010235882655251771, 'sampling/importance_sampling_ratio/mean': 0.9999384880065918, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8360240310430527, 'clip_ratio/low_mean': 4.6149686397711775e-05, 'clip_ratio/low_min': 3.006686938533676e-06, 'clip_ratio/high_mean': 4.259903903403028e-06, 'clip_ratio/high_max': 1.4580486549675697e-05, 'clip_ratio/region_mean': 5.04095905853319e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 430/1024 [19:39:49<27:26:35, 166.32s/it][AINFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 431/1024 [19:42:18<26:32:33, 161.14s/it][A
+                                                         [A{'loss': 0.096, 'grad_norm': 0.004943124484270811, 'learning_rate': 1e-05, 'num_tokens': 378808021.0, 'completions/mean_length': 5696.3984375, 'completions/min_length': 312.0, 'completions/max_length': 15410.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5696.3984375, 'completions/min_terminated_length': 312.0, 'completions/max_terminated_length': 15410.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.31246691942214966, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018845941871404648, 'sampling/sampling_logp_difference/max': 6.499474048614502, 'sampling/importance_sampling_ratio/min': 0.0015042300801724195, 'sampling/importance_sampling_ratio/mean': 0.9999057054519653, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7887749597430229, 'clip_ratio/low_mean': 5.096616632727091e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6704084373486694e-06, 'clip_ratio/high_max': 6.681633749394678e-06, 'clip_ratio/region_mean': 5.263657521936693e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 431/1024 [19:42:18<26:32:33, 161.14s/it][AINFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 432/1024 [19:44:57<26:21:59, 160.34s/it][A
+                                                         [A{'loss': 0.0546, 'grad_norm': 0.00595651101320982, 'learning_rate': 1e-05, 'num_tokens': 379659710.0, 'completions/mean_length': 6480.8828125, 'completions/min_length': 1013.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6323.69091796875, 'completions/min_terminated_length': 1013.0, 'completions/max_terminated_length': 14233.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01906527951359749, 'sampling/sampling_logp_difference/max': 6.325125217437744, 'sampling/importance_sampling_ratio/min': 0.0017907419241964817, 'sampling/importance_sampling_ratio/mean': 0.9998855590820312, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8796411231160164, 'clip_ratio/low_mean': 3.513921649300755e-05, 'clip_ratio/low_min': 6.075038982089609e-06, 'clip_ratio/high_mean': 5.417880970526312e-06, 'clip_ratio/high_max': 1.7526824194646906e-05, 'clip_ratio/region_mean': 4.0557096895099676e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 432/1024 [19:44:57<26:21:59, 160.34s/it][AINFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 433/1024 [19:47:51<27:00:53, 164.56s/it][A
+                                                         [A{'loss': 0.0683, 'grad_norm': 0.0024527597706764936, 'learning_rate': 1e-05, 'num_tokens': 380640720.0, 'completions/mean_length': 7501.703125, 'completions/min_length': 680.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6829.93310546875, 'completions/min_terminated_length': 680.0, 'completions/max_terminated_length': 16204.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01873261108994484, 'sampling/sampling_logp_difference/max': 13.93749713897705, 'sampling/importance_sampling_ratio/min': 8.851602615322918e-07, 'sampling/importance_sampling_ratio/mean': 0.9999595880508423, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.786028303205967, 'clip_ratio/low_mean': 2.4512424602107785e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.4512424602107785e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 433/1024 [19:47:51<27:00:53, 164.56s/it][AINFO 12-02 09:12:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:12:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:12:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:12:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 434/1024 [19:50:20<26:12:59, 159.97s/it][A
+                                                         [A{'loss': 0.0514, 'grad_norm': 0.004280989523977041, 'learning_rate': 1e-05, 'num_tokens': 381377981.0, 'completions/mean_length': 5619.2890625, 'completions/min_length': 602.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5448.4208984375, 'completions/min_terminated_length': 602.0, 'completions/max_terminated_length': 15185.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017923470586538315, 'sampling/sampling_logp_difference/max': 6.883193492889404, 'sampling/importance_sampling_ratio/min': 0.0010248658945783973, 'sampling/importance_sampling_ratio/mean': 0.9999443292617798, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8098893761634827, 'clip_ratio/low_mean': 3.1679782978244475e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.505394312876888e-06, 'clip_ratio/high_max': 1.4606259583160863e-05, 'clip_ratio/region_mean': 3.7185177234277944e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 434/1024 [19:50:20<26:12:59, 159.97s/it][AINFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 435/1024 [19:52:57<26:00:14, 158.94s/it][A
+                                                         [A{'loss': 0.0877, 'grad_norm': 0.004721642471849918, 'learning_rate': 1e-05, 'num_tokens': 382070478.0, 'completions/mean_length': 5243.8203125, 'completions/min_length': 576.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5156.1025390625, 'completions/min_terminated_length': 576.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.6875, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.6875, 'reward_std': 0.26538965106010437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016579966992139816, 'sampling/sampling_logp_difference/max': 6.7663984298706055, 'sampling/importance_sampling_ratio/min': 0.0011518355458974838, 'sampling/importance_sampling_ratio/mean': 0.9999414086341858, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7485036551952362, 'clip_ratio/low_mean': 2.3637440563106793e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.702175888520287e-06, 'clip_ratio/high_max': 1.4808703554081148e-05, 'clip_ratio/region_mean': 2.7339616224253405e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 435/1024 [19:52:57<26:00:14, 158.94s/it][AINFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 436/1024 [19:55:35<25:56:19, 158.81s/it][A
+                                                         [A{'loss': 0.0342, 'grad_norm': 0.00329192029312253, 'learning_rate': 1e-05, 'num_tokens': 382990245.0, 'completions/mean_length': 7021.1796875, 'completions/min_length': 1371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6872.56396484375, 'completions/min_terminated_length': 1371.0, 'completions/max_terminated_length': 15978.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019863136112689972, 'sampling/sampling_logp_difference/max': 6.058165073394775, 'sampling/importance_sampling_ratio/min': 0.0023386883549392223, 'sampling/importance_sampling_ratio/mean': 0.9999822378158569, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8693460151553154, 'clip_ratio/low_mean': 3.602651599976525e-05, 'clip_ratio/low_min': 4.348733455117326e-06, 'clip_ratio/high_mean': 1.1174359769938746e-05, 'clip_ratio/high_max': 3.1177480195765384e-05, 'clip_ratio/region_mean': 4.720087713394605e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 436/1024 [19:55:35<25:56:19, 158.81s/it][AINFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 437/1024 [19:58:32<26:45:53, 164.15s/it][A
+                                                         [A{'loss': 0.1009, 'grad_norm': 0.0051889242604374886, 'learning_rate': 1e-05, 'num_tokens': 383896717.0, 'completions/mean_length': 6917.625, 'completions/min_length': 945.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6452.0654296875, 'completions/min_terminated_length': 945.0, 'completions/max_terminated_length': 15344.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3448137044906616, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019528398290276527, 'sampling/sampling_logp_difference/max': 8.749983787536621, 'sampling/importance_sampling_ratio/min': 0.00015846389578655362, 'sampling/importance_sampling_ratio/mean': 0.9999983310699463, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8466897681355476, 'clip_ratio/low_mean': 4.9078003257818636e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7981737389382033e-06, 'clip_ratio/high_max': 1.1192694955752813e-05, 'clip_ratio/region_mean': 5.1876177280973934e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 437/1024 [19:58:32<26:45:53, 164.15s/it][AINFO 12-02 09:23:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:23:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:23:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:23:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 438/1024 [20:01:27<27:14:43, 167.38s/it][A
+                                                         [A{'loss': 0.0087, 'grad_norm': 0.002855573548004031, 'learning_rate': 1e-05, 'num_tokens': 384872622.0, 'completions/mean_length': 7487.5078125, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7346.2939453125, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 16175.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0209865253418684, 'sampling/sampling_logp_difference/max': 5.557258605957031, 'sampling/importance_sampling_ratio/min': 0.0038593418430536985, 'sampling/importance_sampling_ratio/mean': 0.9999386668205261, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9584660083055496, 'clip_ratio/low_mean': 3.8556312347282073e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.263948757303297e-06, 'clip_ratio/high_max': 2.3224948108691024e-05, 'clip_ratio/region_mean': 4.682026019509067e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 438/1024 [20:01:27<27:14:43, 167.38s/it][AINFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 439/1024 [20:04:04<26:43:19, 164.44s/it][A
+                                                         [A{'loss': 0.0523, 'grad_norm': 0.004437311552464962, 'learning_rate': 1e-05, 'num_tokens': 385744023.0, 'completions/mean_length': 6637.5078125, 'completions/min_length': 998.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6323.1044921875, 'completions/min_terminated_length': 998.0, 'completions/max_terminated_length': 16092.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2603819966316223, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019490888342261314, 'sampling/sampling_logp_difference/max': 5.834418296813965, 'sampling/importance_sampling_ratio/min': 0.002925124252215028, 'sampling/importance_sampling_ratio/mean': 0.9999136924743652, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8841215297579765, 'clip_ratio/low_mean': 2.98128834401723e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5429051245519076e-06, 'clip_ratio/high_max': 6.171620498207631e-06, 'clip_ratio/region_mean': 3.135578845103737e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 439/1024 [20:04:04<26:43:19, 164.44s/it][AINFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 440/1024 [20:06:32<25:51:24, 159.39s/it][A
+                                                         [A{'loss': -0.0075, 'grad_norm': 0.002463799435645342, 'learning_rate': 1e-05, 'num_tokens': 386525492.0, 'completions/mean_length': 5965.9765625, 'completions/min_length': 621.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5800.611328125, 'completions/min_terminated_length': 621.0, 'completions/max_terminated_length': 15143.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01946769654750824, 'sampling/sampling_logp_difference/max': 8.4989652633667, 'sampling/importance_sampling_ratio/min': 0.00020367901015561074, 'sampling/importance_sampling_ratio/mean': 0.9999351501464844, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8726934269070625, 'clip_ratio/low_mean': 5.443932013804442e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3262185752391815e-06, 'clip_ratio/high_max': 1.3304874300956726e-05, 'clip_ratio/region_mean': 5.776553894065728e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 440/1024 [20:06:32<25:51:24, 159.39s/it][AINFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 441/1024 [20:09:06<25:32:57, 157.77s/it][A
+                                                         [A{'loss': 0.0415, 'grad_norm': 0.0038990566972643137, 'learning_rate': 1e-05, 'num_tokens': 387404842.0, 'completions/mean_length': 6693.109375, 'completions/min_length': 1704.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6616.80322265625, 'completions/min_terminated_length': 1704.0, 'completions/max_terminated_length': 16115.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.31587693095207214, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020848294720053673, 'sampling/sampling_logp_difference/max': 6.749990940093994, 'sampling/importance_sampling_ratio/min': 0.0011708902893587947, 'sampling/importance_sampling_ratio/mean': 0.9999700784683228, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9430640190839767, 'clip_ratio/low_mean': 3.598771945689805e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6154040117253317e-06, 'clip_ratio/high_max': 1.0084711902891286e-05, 'clip_ratio/region_mean': 3.9603123695997056e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 441/1024 [20:09:06<25:32:57, 157.77s/it][AINFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 442/1024 [20:12:01<26:20:08, 162.90s/it][A
+                                                         [A{'loss': 0.099, 'grad_norm': 0.0018510994268581271, 'learning_rate': 1e-05, 'num_tokens': 388324475.0, 'completions/mean_length': 7045.6953125, 'completions/min_length': 926.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6505.46240234375, 'completions/min_terminated_length': 926.0, 'completions/max_terminated_length': 16162.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.32195523381233215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020547039806842804, 'sampling/sampling_logp_difference/max': 5.752217769622803, 'sampling/importance_sampling_ratio/min': 0.0031757301185280085, 'sampling/importance_sampling_ratio/mean': 0.9999024868011475, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8912066072225571, 'clip_ratio/low_mean': 5.234285907818048e-05, 'clip_ratio/low_min': 4.47803950009984e-06, 'clip_ratio/high_mean': 1.8656716065379442e-06, 'clip_ratio/high_max': 7.462686426151777e-06, 'clip_ratio/region_mean': 5.420853057103159e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 442/1024 [20:12:01<26:20:08, 162.90s/it][AINFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 443/1024 [20:14:40<26:07:54, 161.92s/it][A
+                                                         [A{'loss': 0.061, 'grad_norm': 0.004439481534063816, 'learning_rate': 1e-05, 'num_tokens': 389305644.0, 'completions/mean_length': 7480.0078125, 'completions/min_length': 1130.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7266.3125, 'completions/min_terminated_length': 1130.0, 'completions/max_terminated_length': 15734.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.31300368905067444, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01973455585539341, 'sampling/sampling_logp_difference/max': 4.899544715881348, 'sampling/importance_sampling_ratio/min': 0.007449973840266466, 'sampling/importance_sampling_ratio/mean': 0.9999762773513794, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8813760280609131, 'clip_ratio/low_mean': 6.165269871871715e-05, 'clip_ratio/low_min': 3.5272871627967106e-06, 'clip_ratio/high_mean': 6.26131770786742e-06, 'clip_ratio/high_max': 2.504527083146968e-05, 'clip_ratio/region_mean': 6.791401551708987e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 443/1024 [20:14:40<26:07:54, 161.92s/it][AINFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 444/1024 [20:17:11<25:32:19, 158.52s/it][A
+                                                         [A{'loss': -0.0068, 'grad_norm': 0.004181519150733948, 'learning_rate': 1e-05, 'num_tokens': 390229373.0, 'completions/mean_length': 7044.4453125, 'completions/min_length': 1229.0, 'completions/max_length': 15302.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7044.4453125, 'completions/min_terminated_length': 1229.0, 'completions/max_terminated_length': 15302.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.17700131237506866, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021211043000221252, 'sampling/sampling_logp_difference/max': 8.397781372070312, 'sampling/importance_sampling_ratio/min': 0.00022536676260642707, 'sampling/importance_sampling_ratio/mean': 1.0000314712524414, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9901906549930573, 'clip_ratio/low_mean': 3.662567087303614e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0245229304928216e-06, 'clip_ratio/high_max': 4.0980917219712865e-06, 'clip_ratio/region_mean': 3.76501939172158e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 444/1024 [20:17:11<25:32:19, 158.52s/it][AINFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 43%|████▎     | 445/1024 [20:20:09<26:26:12, 164.37s/it][A
+                                                         [A{'loss': 0.035, 'grad_norm': 0.002691390924155712, 'learning_rate': 1e-05, 'num_tokens': 391251141.0, 'completions/mean_length': 7815.8125, 'completions/min_length': 1350.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7244.6005859375, 'completions/min_terminated_length': 1350.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.31222954392433167, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018415704369544983, 'sampling/sampling_logp_difference/max': 4.864527702331543, 'sampling/importance_sampling_ratio/min': 0.007715471088886261, 'sampling/importance_sampling_ratio/mean': 0.99993896484375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8278292864561081, 'clip_ratio/low_mean': 5.29995777469594e-05, 'clip_ratio/low_min': 3.708758640641463e-06, 'clip_ratio/high_mean': 3.7274680266818905e-06, 'clip_ratio/high_max': 1.4909872106727562e-05, 'clip_ratio/region_mean': 5.672704537573736e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 445/1024 [20:20:09<26:26:12, 164.37s/it][AINFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▎     | 446/1024 [20:22:25<25:01:21, 155.85s/it][A
+                                                         [A{'loss': 0.1153, 'grad_norm': 0.0069543467834591866, 'learning_rate': 1e-05, 'num_tokens': 391956196.0, 'completions/mean_length': 5305.1796875, 'completions/min_length': 1017.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5217.94482421875, 'completions/min_terminated_length': 1017.0, 'completions/max_terminated_length': 15202.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017318082973361015, 'sampling/sampling_logp_difference/max': 5.996687889099121, 'sampling/importance_sampling_ratio/min': 0.0024869756307452917, 'sampling/importance_sampling_ratio/mean': 1.0000190734863281, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8100772425532341, 'clip_ratio/low_mean': 3.196247394043894e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.629899417021079e-06, 'clip_ratio/high_max': 2.1858722902834415e-05, 'clip_ratio/region_mean': 3.859237290271267e-05, 'epoch': 0.41}
+
+ 44%|████▎     | 446/1024 [20:22:25<25:01:21, 155.85s/it][AINFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▎     | 447/1024 [20:25:12<25:32:09, 159.32s/it][A
+                                                         [A{'loss': 0.0883, 'grad_norm': 0.0065611582249403, 'learning_rate': 1e-05, 'num_tokens': 392908430.0, 'completions/mean_length': 7299.578125, 'completions/min_length': 1008.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6930.29248046875, 'completions/min_terminated_length': 1008.0, 'completions/max_terminated_length': 15300.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02127375639975071, 'sampling/sampling_logp_difference/max': 11.873339653015137, 'sampling/importance_sampling_ratio/min': 6.9738744059577584e-06, 'sampling/importance_sampling_ratio/mean': 0.9999696016311646, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9955824315547943, 'clip_ratio/low_mean': 5.289376917971822e-05, 'clip_ratio/low_min': 4.21926688431995e-06, 'clip_ratio/high_mean': 8.056288947955181e-06, 'clip_ratio/high_max': 2.461934036546154e-05, 'clip_ratio/region_mean': 6.0950058468733914e-05, 'epoch': 0.41}
+
+ 44%|████▎     | 447/1024 [20:25:12<25:32:09, 159.32s/it][AINFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 448/1024 [20:28:04<26:04:47, 163.00s/it][A
+                                                         [A{'loss': 0.0725, 'grad_norm': 0.0032975098583847284, 'learning_rate': 1e-05, 'num_tokens': 393788286.0, 'completions/mean_length': 6702.9375, 'completions/min_length': 469.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6390.64501953125, 'completions/min_terminated_length': 469.0, 'completions/max_terminated_length': 16221.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.27168765664100647, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019461583346128464, 'sampling/sampling_logp_difference/max': 8.160128593444824, 'sampling/importance_sampling_ratio/min': 0.00028582560480572283, 'sampling/importance_sampling_ratio/mean': 0.9999115467071533, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.82919991761446, 'clip_ratio/low_mean': 3.89272447591793e-05, 'clip_ratio/low_min': 4.047796210215893e-06, 'clip_ratio/high_mean': 7.412756531266496e-06, 'clip_ratio/high_max': 2.4339562514796853e-05, 'clip_ratio/region_mean': 4.6340001517819474e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 448/1024 [20:28:04<26:04:47, 163.00s/it][AINFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 449/1024 [20:31:27<27:55:34, 174.84s/it][A
+                                                         [A{'loss': 0.1149, 'grad_norm': 0.0032787907402962446, 'learning_rate': 1e-05, 'num_tokens': 394638159.0, 'completions/mean_length': 6468.9453125, 'completions/min_length': 808.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 5536.7607421875, 'completions/min_terminated_length': 808.0, 'completions/max_terminated_length': 15244.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016151495277881622, 'sampling/sampling_logp_difference/max': 8.999967575073242, 'sampling/importance_sampling_ratio/min': 0.00012341380352154374, 'sampling/importance_sampling_ratio/mean': 0.9999669790267944, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6471721827983856, 'clip_ratio/low_mean': 3.195798365140945e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.951899765932467e-06, 'clip_ratio/high_max': 2.3807599063729867e-05, 'clip_ratio/region_mean': 3.790988330365508e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 449/1024 [20:31:27<27:55:34, 174.84s/it][AINFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 450/1024 [20:34:11<27:22:28, 171.69s/it][A
+                                                         [A{'loss': 0.0967, 'grad_norm': 0.0038375966250896454, 'learning_rate': 1e-05, 'num_tokens': 395493872.0, 'completions/mean_length': 6547.3203125, 'completions/min_length': 587.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6230.0078125, 'completions/min_terminated_length': 587.0, 'completions/max_terminated_length': 15931.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.30798619985580444, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019957344979047775, 'sampling/sampling_logp_difference/max': 8.739748001098633, 'sampling/importance_sampling_ratio/min': 0.00016009423416107893, 'sampling/importance_sampling_ratio/mean': 0.9999747276306152, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9123960956931114, 'clip_ratio/low_mean': 6.035319393049576e-05, 'clip_ratio/low_min': 4.063190772285452e-06, 'clip_ratio/high_mean': 5.61768172246957e-06, 'clip_ratio/high_max': 2.247072688987828e-05, 'clip_ratio/region_mean': 6.597087667614687e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 450/1024 [20:34:11<27:22:28, 171.69s/it][AINFO 12-02 09:59:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:59:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:59:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:59:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 451/1024 [20:36:53<26:51:44, 168.77s/it][A
+                                                         [A{'loss': 0.0656, 'grad_norm': 0.003903903067111969, 'learning_rate': 1e-05, 'num_tokens': 396320254.0, 'completions/mean_length': 6291.859375, 'completions/min_length': 823.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6131.6669921875, 'completions/min_terminated_length': 823.0, 'completions/max_terminated_length': 15058.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2569621503353119, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020753150805830956, 'sampling/sampling_logp_difference/max': 11.93381404876709, 'sampling/importance_sampling_ratio/min': 6.564632712979801e-06, 'sampling/importance_sampling_ratio/mean': 0.9999452829360962, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9841655194759369, 'clip_ratio/low_mean': 2.315102483407827e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5112059322273126e-06, 'clip_ratio/high_max': 1.404482372890925e-05, 'clip_ratio/region_mean': 2.6662230766305584e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 451/1024 [20:36:53<26:51:44, 168.77s/it][AINFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 452/1024 [20:39:55<27:27:47, 172.84s/it][A
+                                                         [A{'loss': 0.0511, 'grad_norm': 0.005152889993041754, 'learning_rate': 1e-05, 'num_tokens': 397327029.0, 'completions/mean_length': 7692.4296875, 'completions/min_length': 1269.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7339.11376953125, 'completions/min_terminated_length': 1269.0, 'completions/max_terminated_length': 15966.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02036213129758835, 'sampling/sampling_logp_difference/max': 9.897988319396973, 'sampling/importance_sampling_ratio/min': 5.027571751270443e-05, 'sampling/importance_sampling_ratio/mean': 0.9999433755874634, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.94080401211977, 'clip_ratio/low_mean': 3.547988831087423e-05, 'clip_ratio/low_min': 3.3967392027989263e-06, 'clip_ratio/high_mean': 4.615214265868417e-06, 'clip_ratio/high_max': 1.5189204987109406e-05, 'clip_ratio/region_mean': 4.009510257674265e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 452/1024 [20:39:55<27:27:47, 172.84s/it][AINFO 12-02 10:04:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 453/1024 [20:42:43<27:09:15, 171.20s/it][A
+                                                         [A{'loss': 0.0182, 'grad_norm': 0.0035838852636516094, 'learning_rate': 1e-05, 'num_tokens': 398237536.0, 'completions/mean_length': 6968.0859375, 'completions/min_length': 893.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6742.1044921875, 'completions/min_terminated_length': 893.0, 'completions/max_terminated_length': 15305.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020200733095407486, 'sampling/sampling_logp_difference/max': 6.030359745025635, 'sampling/importance_sampling_ratio/min': 0.002404628787189722, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9254838973283768, 'clip_ratio/low_mean': 2.335082047011383e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.0586507970583625e-06, 'clip_ratio/high_max': 1.733157705530175e-05, 'clip_ratio/region_mean': 2.9409470812424843e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 453/1024 [20:42:43<27:09:15, 171.20s/it][AINFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 454/1024 [20:45:54<28:02:50, 177.14s/it][A
+                                                         [A{'loss': 0.0412, 'grad_norm': 0.0036290446296334267, 'learning_rate': 1e-05, 'num_tokens': 399373298.0, 'completions/mean_length': 8711.078125, 'completions/min_length': 1049.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8199.55078125, 'completions/min_terminated_length': 1049.0, 'completions/max_terminated_length': 16309.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.19568344950675964, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0201371181756258, 'sampling/sampling_logp_difference/max': 9.291923522949219, 'sampling/importance_sampling_ratio/min': 9.216561011271551e-05, 'sampling/importance_sampling_ratio/mean': 1.000042200088501, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8735406622290611, 'clip_ratio/low_mean': 3.311113533754906e-05, 'clip_ratio/low_min': 6.725854291289579e-06, 'clip_ratio/high_mean': 1.116230919251393e-06, 'clip_ratio/high_max': 4.464923677005572e-06, 'clip_ratio/region_mean': 3.422736637048729e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 454/1024 [20:45:54<28:02:50, 177.14s/it][AINFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 455/1024 [20:48:28<26:56:16, 170.43s/it][A
+                                                         [A{'loss': 0.0633, 'grad_norm': 0.004067540634423494, 'learning_rate': 1e-05, 'num_tokens': 400273708.0, 'completions/mean_length': 6891.078125, 'completions/min_length': 827.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6663.24853515625, 'completions/min_terminated_length': 827.0, 'completions/max_terminated_length': 14737.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.27274850010871887, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019800148904323578, 'sampling/sampling_logp_difference/max': 14.731733322143555, 'sampling/importance_sampling_ratio/min': 4.0002717582865444e-07, 'sampling/importance_sampling_ratio/mean': 0.9999425411224365, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8689641878008842, 'clip_ratio/low_mean': 3.3217100849469716e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.675666151702899e-06, 'clip_ratio/high_max': 3.4702664606811595e-05, 'clip_ratio/region_mean': 4.189276808119757e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 455/1024 [20:48:28<26:56:16, 170.43s/it][AINFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 456/1024 [20:51:10<26:28:46, 167.83s/it][A
+                                                         [A{'loss': 0.0743, 'grad_norm': 0.0026191689539700747, 'learning_rate': 1e-05, 'num_tokens': 401177497.0, 'completions/mean_length': 6899.3515625, 'completions/min_length': 1149.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6748.8017578125, 'completions/min_terminated_length': 1149.0, 'completions/max_terminated_length': 15234.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.20251333713531494, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021380646154284477, 'sampling/sampling_logp_difference/max': 6.3249406814575195, 'sampling/importance_sampling_ratio/min': 0.0017910725437104702, 'sampling/importance_sampling_ratio/mean': 0.9999812841415405, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9442604705691338, 'clip_ratio/low_mean': 3.564125790944672e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.347927066803095e-07, 'clip_ratio/high_max': 2.939170826721238e-06, 'clip_ratio/region_mean': 3.6376050502440194e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 456/1024 [20:51:10<26:28:46, 167.83s/it][AINFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 10:18:20,061 - math_verify.grader - WARNING - Timeout during comparison
+
+ 45%|████▍     | 457/1024 [20:54:26<27:45:22, 176.23s/it][A
+                                                         [A{'loss': 0.0674, 'grad_norm': 0.003141516586765647, 'learning_rate': 1e-05, 'num_tokens': 402115812.0, 'completions/mean_length': 7175.8359375, 'completions/min_length': 919.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7029.6748046875, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 16226.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.21040895581245422, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01970163732767105, 'sampling/sampling_logp_difference/max': 6.672667980194092, 'sampling/importance_sampling_ratio/min': 0.001265019178390503, 'sampling/importance_sampling_ratio/mean': 0.9999862909317017, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8653769046068192, 'clip_ratio/low_mean': 2.57235833487357e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.24901032197522e-06, 'clip_ratio/high_max': 8.99604128790088e-06, 'clip_ratio/region_mean': 2.797259367071092e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 457/1024 [20:54:26<27:45:22, 176.23s/it][AINFO 12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 458/1024 [20:57:20<27:37:15, 175.68s/it][A
+                                                         [A{'loss': 0.0751, 'grad_norm': 0.001980370609089732, 'learning_rate': 1e-05, 'num_tokens': 403048385.0, 'completions/mean_length': 7090.8515625, 'completions/min_length': 606.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6791.072265625, 'completions/min_terminated_length': 606.0, 'completions/max_terminated_length': 16250.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021090596914291382, 'sampling/sampling_logp_difference/max': 13.47822093963623, 'sampling/importance_sampling_ratio/min': 1.4011449138706666e-06, 'sampling/importance_sampling_ratio/mean': 0.9999619722366333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9437825232744217, 'clip_ratio/low_mean': 3.116219727417047e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.700014874790213e-06, 'clip_ratio/high_max': 1.0800059499160852e-05, 'clip_ratio/region_mean': 3.3862211807900167e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 458/1024 [20:57:20<27:37:15, 175.68s/it][AINFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 459/1024 [20:59:57<26:42:06, 170.14s/it][A
+                                                         [A{'loss': 0.059, 'grad_norm': 0.003833206370472908, 'learning_rate': 1e-05, 'num_tokens': 403968037.0, 'completions/mean_length': 7033.65625, 'completions/min_length': 1007.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6809.24853515625, 'completions/min_terminated_length': 1007.0, 'completions/max_terminated_length': 16175.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.28460076451301575, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019913772121071815, 'sampling/sampling_logp_difference/max': 6.1218976974487305, 'sampling/importance_sampling_ratio/min': 0.0021942879538983107, 'sampling/importance_sampling_ratio/mean': 1.0000317096710205, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8789731040596962, 'clip_ratio/low_mean': 4.8558076969129615e-05, 'clip_ratio/low_min': 4.8952420002024155e-06, 'clip_ratio/high_mean': 6.370712640091369e-06, 'clip_ratio/high_max': 2.5482850560365478e-05, 'clip_ratio/region_mean': 5.4928788131292094e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 459/1024 [20:59:58<26:42:06, 170.14s/it][AINFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 460/1024 [21:02:57<27:06:18, 173.01s/it][A
+                                                         [A{'loss': 0.1581, 'grad_norm': 0.005315023008733988, 'learning_rate': 1e-05, 'num_tokens': 404881584.0, 'completions/mean_length': 6992.8984375, 'completions/min_length': 754.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6611.14599609375, 'completions/min_terminated_length': 754.0, 'completions/max_terminated_length': 16107.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01872519962489605, 'sampling/sampling_logp_difference/max': 9.998538970947266, 'sampling/importance_sampling_ratio/min': 4.546630952972919e-05, 'sampling/importance_sampling_ratio/mean': 1.0000758171081543, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.857115626335144, 'clip_ratio/low_mean': 6.774969961043098e-05, 'clip_ratio/low_min': 3.189914878021227e-06, 'clip_ratio/high_mean': 1.0172194606639096e-06, 'clip_ratio/high_max': 4.068877842655638e-06, 'clip_ratio/region_mean': 6.876691895740805e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 460/1024 [21:02:57<27:06:18, 173.01s/it][AINFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 461/1024 [21:05:39<26:31:46, 169.64s/it][A
+                                                         [A{'loss': 0.1076, 'grad_norm': 0.0074885934591293335, 'learning_rate': 1e-05, 'num_tokens': 405749105.0, 'completions/mean_length': 6623.2578125, 'completions/min_length': 221.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6226.4794921875, 'completions/min_terminated_length': 221.0, 'completions/max_terminated_length': 16095.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01930626854300499, 'sampling/sampling_logp_difference/max': 6.748711109161377, 'sampling/importance_sampling_ratio/min': 0.0011723897187039256, 'sampling/importance_sampling_ratio/mean': 0.9999799728393555, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8803941905498505, 'clip_ratio/low_mean': 3.3195502112448594e-05, 'clip_ratio/low_min': 5.25188033861923e-06, 'clip_ratio/high_mean': 2.9176186444601626e-06, 'clip_ratio/high_max': 1.167047457784065e-05, 'clip_ratio/region_mean': 3.611312064322192e-05, 'epoch': 0.42}
+
+ 45%|████▌     | 461/1024 [21:05:39<26:31:46, 169.64s/it][AINFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 462/1024 [21:08:19<26:02:32, 166.82s/it][A
+                                                         [A{'loss': 0.0536, 'grad_norm': 0.003960717935115099, 'learning_rate': 1e-05, 'num_tokens': 406704618.0, 'completions/mean_length': 7244.8203125, 'completions/min_length': 1227.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6647.5419921875, 'completions/min_terminated_length': 1227.0, 'completions/max_terminated_length': 15032.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2880108058452606, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02019711770117283, 'sampling/sampling_logp_difference/max': 10.98397159576416, 'sampling/importance_sampling_ratio/min': 1.69715603988152e-05, 'sampling/importance_sampling_ratio/mean': 0.9999812841415405, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9202689751982689, 'clip_ratio/low_mean': 5.09268712676203e-05, 'clip_ratio/low_min': 1.1170248626513057e-05, 'clip_ratio/high_mean': 1.0293827017449075e-06, 'clip_ratio/high_max': 4.11753080697963e-06, 'clip_ratio/region_mean': 5.195625465148623e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 462/1024 [21:08:19<26:02:32, 166.82s/it][AINFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 463/1024 [21:11:02<25:47:14, 165.48s/it][A
+                                                         [A{'loss': 0.1054, 'grad_norm': 0.003602087963372469, 'learning_rate': 1e-05, 'num_tokens': 407677177.0, 'completions/mean_length': 7462.0546875, 'completions/min_length': 669.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6867.2587890625, 'completions/min_terminated_length': 669.0, 'completions/max_terminated_length': 16296.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.35482609272003174, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01856713369488716, 'sampling/sampling_logp_difference/max': 7.155362129211426, 'sampling/importance_sampling_ratio/min': 0.0007806668290868402, 'sampling/importance_sampling_ratio/mean': 0.9999440312385559, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8141553401947021, 'clip_ratio/low_mean': 5.367962035052187e-05, 'clip_ratio/low_min': 6.5083827394119e-06, 'clip_ratio/high_mean': 1.0519701334033016e-05, 'clip_ratio/high_max': 2.874629831239872e-05, 'clip_ratio/region_mean': 6.419932219614566e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 463/1024 [21:11:02<25:47:14, 165.48s/it][AINFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 464/1024 [21:13:44<25:35:07, 164.48s/it][A
+                                                         [A{'loss': 0.061, 'grad_norm': 0.004038481041789055, 'learning_rate': 1e-05, 'num_tokens': 408552512.0, 'completions/mean_length': 6683.1796875, 'completions/min_length': 775.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6529.19873046875, 'completions/min_terminated_length': 775.0, 'completions/max_terminated_length': 15750.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.25620076060295105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02077356167137623, 'sampling/sampling_logp_difference/max': 10.014501571655273, 'sampling/importance_sampling_ratio/min': 4.474630986806005e-05, 'sampling/importance_sampling_ratio/mean': 1.0000439882278442, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9070071652531624, 'clip_ratio/low_mean': 3.5997712757307454e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.973188073468918e-06, 'clip_ratio/high_max': 2.6413443720230134e-05, 'clip_ratio/region_mean': 4.497090230870526e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 464/1024 [21:13:44<25:35:07, 164.48s/it][AINFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 465/1024 [21:16:21<25:12:34, 162.35s/it][A
+                                                         [A{'loss': 0.0295, 'grad_norm': 0.004457853268831968, 'learning_rate': 1e-05, 'num_tokens': 409399257.0, 'completions/mean_length': 6472.9453125, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5985.51611328125, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 15864.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.20517179369926453, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020475786179304123, 'sampling/sampling_logp_difference/max': 6.343741416931152, 'sampling/importance_sampling_ratio/min': 0.0017577135004103184, 'sampling/importance_sampling_ratio/mean': 0.9999473690986633, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8807859197258949, 'clip_ratio/low_mean': 3.225401701456576e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.292822495699511e-06, 'clip_ratio/high_max': 1.7171289982798044e-05, 'clip_ratio/region_mean': 3.654683996501262e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 465/1024 [21:16:21<25:12:34, 162.35s/it][AINFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 466/1024 [21:18:57<24:52:25, 160.48s/it][A
+                                                         [A{'loss': 0.1217, 'grad_norm': 0.0033953245729207993, 'learning_rate': 1e-05, 'num_tokens': 410185645.0, 'completions/mean_length': 5989.78125, 'completions/min_length': 610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5654.48388671875, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 15896.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3735082745552063, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017986344173550606, 'sampling/sampling_logp_difference/max': 10.935420036315918, 'sampling/importance_sampling_ratio/min': 1.781588616722729e-05, 'sampling/importance_sampling_ratio/mean': 0.9999676942825317, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8479711338877678, 'clip_ratio/low_mean': 5.706528349946893e-05, 'clip_ratio/low_min': 2.5156462925224332e-05, 'clip_ratio/high_mean': 1.584139977239829e-05, 'clip_ratio/high_max': 5.442162637336878e-05, 'clip_ratio/region_mean': 7.290668463610928e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 466/1024 [21:18:57<24:52:25, 160.48s/it][AINFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 467/1024 [21:22:03<26:01:27, 168.20s/it][A
+                                                         [A{'loss': 0.0651, 'grad_norm': 0.002381247701123357, 'learning_rate': 1e-05, 'num_tokens': 411268974.0, 'completions/mean_length': 8299.9453125, 'completions/min_length': 1123.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8171.62744140625, 'completions/min_terminated_length': 1123.0, 'completions/max_terminated_length': 16103.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2477683573961258, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021354343742132187, 'sampling/sampling_logp_difference/max': 7.4999823570251465, 'sampling/importance_sampling_ratio/min': 0.000553094083443284, 'sampling/importance_sampling_ratio/mean': 0.9999679327011108, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9363152608275414, 'clip_ratio/low_mean': 5.2673244681500364e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.2673244681500364e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 467/1024 [21:22:03<26:01:27, 168.20s/it][AINFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 468/1024 [21:24:56<26:11:28, 169.58s/it][A
+                                                         [A{'loss': -0.003, 'grad_norm': 0.006341467145830393, 'learning_rate': 1e-05, 'num_tokens': 412238117.0, 'completions/mean_length': 7434.0546875, 'completions/min_length': 898.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7219.25634765625, 'completions/min_terminated_length': 898.0, 'completions/max_terminated_length': 14838.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02139873616397381, 'sampling/sampling_logp_difference/max': 6.249992847442627, 'sampling/importance_sampling_ratio/min': 0.0019304680172353983, 'sampling/importance_sampling_ratio/mean': 1.0000128746032715, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.981913685798645, 'clip_ratio/low_mean': 2.84454882830687e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1446739992825314e-06, 'clip_ratio/high_max': 8.578695997130126e-06, 'clip_ratio/region_mean': 3.059016239603807e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 468/1024 [21:24:56<26:11:28, 169.58s/it][AINFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 469/1024 [21:27:44<26:03:18, 169.01s/it][A
+                                                         [A{'loss': 0.0562, 'grad_norm': 0.002621602965518832, 'learning_rate': 1e-05, 'num_tokens': 413182860.0, 'completions/mean_length': 7211.1796875, 'completions/min_length': 280.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7138.95263671875, 'completions/min_terminated_length': 280.0, 'completions/max_terminated_length': 15871.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.34716784954071045, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020250719040632248, 'sampling/sampling_logp_difference/max': 9.874974250793457, 'sampling/importance_sampling_ratio/min': 5.1446182624204084e-05, 'sampling/importance_sampling_ratio/mean': 0.9999529123306274, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9307222217321396, 'clip_ratio/low_mean': 5.4699471832009294e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.150076049176278e-06, 'clip_ratio/high_max': 1.7187987396027893e-05, 'clip_ratio/region_mean': 5.9849548279089504e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 469/1024 [21:27:44<26:03:18, 169.01s/it][AINFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 470/1024 [21:30:05<24:43:50, 160.70s/it][A
+                                                         [A{'loss': 0.0657, 'grad_norm': 0.0035241330042481422, 'learning_rate': 1e-05, 'num_tokens': 413885963.0, 'completions/mean_length': 5349.4296875, 'completions/min_length': 983.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5174.2783203125, 'completions/min_terminated_length': 983.0, 'completions/max_terminated_length': 15726.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.25330984592437744, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01975759118795395, 'sampling/sampling_logp_difference/max': 7.938032150268555, 'sampling/importance_sampling_ratio/min': 0.0003569081309251487, 'sampling/importance_sampling_ratio/mean': 0.9999449253082275, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0213474333286285, 'clip_ratio/low_mean': 4.7740833792886406e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9217885614561965e-06, 'clip_ratio/high_max': 1.0867412584047997e-05, 'clip_ratio/region_mean': 5.16626223543426e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 470/1024 [21:30:05<24:43:50, 160.70s/it][AINFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 471/1024 [21:32:57<25:11:18, 163.97s/it][A
+                                                         [A{'loss': 0.0635, 'grad_norm': 0.0014164346503093839, 'learning_rate': 1e-05, 'num_tokens': 414870560.0, 'completions/mean_length': 7542.8515625, 'completions/min_length': 1359.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7257.65283203125, 'completions/min_terminated_length': 1359.0, 'completions/max_terminated_length': 15357.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.20753081142902374, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020874422043561935, 'sampling/sampling_logp_difference/max': 9.651104927062988, 'sampling/importance_sampling_ratio/min': 6.435441900976002e-05, 'sampling/importance_sampling_ratio/mean': 1.0000402927398682, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8882969543337822, 'clip_ratio/low_mean': 2.699725871480041e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.673786522995215e-06, 'clip_ratio/high_max': 1.469514609198086e-05, 'clip_ratio/region_mean': 3.0671045237795624e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 471/1024 [21:32:57<25:11:18, 163.97s/it][AINFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 472/1024 [21:35:49<25:30:47, 166.39s/it][A
+                                                         [A{'loss': 0.0567, 'grad_norm': 0.0026956009678542614, 'learning_rate': 1e-05, 'num_tokens': 415825252.0, 'completions/mean_length': 7286.90625, 'completions/min_length': 977.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6993.451171875, 'completions/min_terminated_length': 977.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0202642735093832, 'sampling/sampling_logp_difference/max': 6.229649543762207, 'sampling/importance_sampling_ratio/min': 0.0019701423589140177, 'sampling/importance_sampling_ratio/mean': 0.999917209148407, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9254636988043785, 'clip_ratio/low_mean': 3.673103901746799e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.17456874401978e-06, 'clip_ratio/high_max': 1.669827497607912e-05, 'clip_ratio/region_mean': 4.090560787517461e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 472/1024 [21:35:49<25:30:47, 166.39s/it][AINFO 12-02 11:00:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 473/1024 [21:38:52<26:13:32, 171.35s/it][A
+                                                         [A{'loss': 0.0577, 'grad_norm': 0.0022128887940198183, 'learning_rate': 1e-05, 'num_tokens': 416774011.0, 'completions/mean_length': 7244.7421875, 'completions/min_length': 1010.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6716.0244140625, 'completions/min_terminated_length': 1010.0, 'completions/max_terminated_length': 15908.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2937847375869751, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01840684749186039, 'sampling/sampling_logp_difference/max': 6.499997138977051, 'sampling/importance_sampling_ratio/min': 0.0015034435782581568, 'sampling/importance_sampling_ratio/mean': 1.000002384185791, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7817923128604889, 'clip_ratio/low_mean': 3.818475033767754e-05, 'clip_ratio/low_min': 7.20606476534158e-06, 'clip_ratio/high_mean': 2.2905113610249828e-06, 'clip_ratio/high_max': 9.162045444099931e-06, 'clip_ratio/region_mean': 4.047526181238936e-05, 'epoch': 0.44}
+
+ 46%|████▌     | 473/1024 [21:38:52<26:13:32, 171.35s/it][AINFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▋     | 474/1024 [21:42:07<27:17:09, 178.60s/it][A
+                                                         [A{'loss': 0.0504, 'grad_norm': 0.0034676652867347, 'learning_rate': 1e-05, 'num_tokens': 417951311.0, 'completions/mean_length': 9042.90625, 'completions/min_length': 997.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 8283.482421875, 'completions/min_terminated_length': 997.0, 'completions/max_terminated_length': 16254.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02112819254398346, 'sampling/sampling_logp_difference/max': 8.239109992980957, 'sampling/importance_sampling_ratio/min': 0.0002641192404553294, 'sampling/importance_sampling_ratio/mean': 0.9999234080314636, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9306210279464722, 'clip_ratio/low_mean': 3.636896872194484e-05, 'clip_ratio/low_min': 3.1460788250115e-06, 'clip_ratio/high_mean': 3.0582178283111716e-06, 'clip_ratio/high_max': 1.2232871313244686e-05, 'clip_ratio/region_mean': 3.9427186266038916e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 474/1024 [21:42:07<27:17:09, 178.60s/it][AINFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▋     | 475/1024 [21:44:40<26:03:40, 170.89s/it][A
+                                                         [A{'loss': 0.0704, 'grad_norm': 0.0030218157917261124, 'learning_rate': 1e-05, 'num_tokens': 418836184.0, 'completions/mean_length': 6763.6328125, 'completions/min_length': 826.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6610.9287109375, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 15721.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.30091896653175354, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021101050078868866, 'sampling/sampling_logp_difference/max': 7.880997180938721, 'sampling/importance_sampling_ratio/min': 0.0003778560785576701, 'sampling/importance_sampling_ratio/mean': 0.9999898672103882, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9879302233457565, 'clip_ratio/low_mean': 4.3606626604741905e-05, 'clip_ratio/low_min': 3.5752079838857753e-06, 'clip_ratio/high_mean': 8.202394610634656e-06, 'clip_ratio/high_max': 2.5187824576278217e-05, 'clip_ratio/region_mean': 5.1809020988002885e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 475/1024 [21:44:40<26:03:40, 170.89s/it][AINFO 12-02 11:09:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▋     | 476/1024 [21:47:20<25:31:16, 167.66s/it][A
+                                                         [A{'loss': 0.0592, 'grad_norm': 0.002881827764213085, 'learning_rate': 1e-05, 'num_tokens': 419726192.0, 'completions/mean_length': 6794.25, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6564.09619140625, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 15675.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0210823193192482, 'sampling/sampling_logp_difference/max': 13.897041320800781, 'sampling/importance_sampling_ratio/min': 9.217044407705544e-07, 'sampling/importance_sampling_ratio/mean': 0.9999275207519531, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0259138569235802, 'clip_ratio/low_mean': 6.21261324340594e-05, 'clip_ratio/low_min': 3.6509140954876784e-06, 'clip_ratio/high_mean': 2.6610464374243747e-06, 'clip_ratio/high_max': 1.0644185749697499e-05, 'clip_ratio/region_mean': 6.478717887148377e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 476/1024 [21:47:20<25:31:16, 167.66s/it][AINFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 477/1024 [21:49:42<24:18:07, 159.94s/it][A
+                                                         [A{'loss': 0.0484, 'grad_norm': 0.00289533962495625, 'learning_rate': 1e-05, 'num_tokens': 420468867.0, 'completions/mean_length': 5648.2109375, 'completions/min_length': 935.0, 'completions/max_length': 14281.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5648.2109375, 'completions/min_terminated_length': 935.0, 'completions/max_terminated_length': 14281.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2675113081932068, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018499158322811127, 'sampling/sampling_logp_difference/max': 6.590811729431152, 'sampling/importance_sampling_ratio/min': 0.001372925122268498, 'sampling/importance_sampling_ratio/mean': 0.9998449087142944, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.88894472271204, 'clip_ratio/low_mean': 4.70996876629215e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7721512196876574e-06, 'clip_ratio/high_max': 1.108860487875063e-05, 'clip_ratio/region_mean': 4.9871839337356505e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 477/1024 [21:49:42<24:18:07, 159.94s/it][AINFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 478/1024 [21:52:34<24:46:33, 163.36s/it][A
+                                                         [A{'loss': 0.0012, 'grad_norm': 0.002749695209786296, 'learning_rate': 1e-05, 'num_tokens': 421280881.0, 'completions/mean_length': 6188.359375, 'completions/min_length': 1085.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6026.52392578125, 'completions/min_terminated_length': 1085.0, 'completions/max_terminated_length': 15657.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.15991678833961487, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.018456483259797096, 'sampling/sampling_logp_difference/max': 5.386401653289795, 'sampling/importance_sampling_ratio/min': 0.004578418098390102, 'sampling/importance_sampling_ratio/mean': 0.9999796152114868, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8476063013076782, 'clip_ratio/low_mean': 2.4103785335682915e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1883936394951888e-06, 'clip_ratio/high_max': 4.753574557980755e-06, 'clip_ratio/region_mean': 2.5292179316238617e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 478/1024 [21:52:34<24:46:33, 163.36s/it][AINFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 479/1024 [21:55:21<24:55:26, 164.64s/it][A
+                                                         [A{'loss': 0.0347, 'grad_norm': 0.005116373300552368, 'learning_rate': 1e-05, 'num_tokens': 422177822.0, 'completions/mean_length': 6864.3515625, 'completions/min_length': 1065.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6635.88037109375, 'completions/min_terminated_length': 1065.0, 'completions/max_terminated_length': 15112.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01979806460440159, 'sampling/sampling_logp_difference/max': 8.498090744018555, 'sampling/importance_sampling_ratio/min': 0.00020385721290949732, 'sampling/importance_sampling_ratio/mean': 0.9999545216560364, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8666203916072845, 'clip_ratio/low_mean': 4.786080125995795e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0339978757656354e-05, 'clip_ratio/high_max': 4.1359915030625416e-05, 'clip_ratio/region_mean': 5.8200780586048495e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 479/1024 [21:55:21<24:55:26, 164.64s/it][AINFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 480/1024 [21:58:20<25:30:29, 168.80s/it][A
+                                                         [A{'loss': 0.019, 'grad_norm': 0.0020944855641573668, 'learning_rate': 1e-05, 'num_tokens': 423096576.0, 'completions/mean_length': 7023.828125, 'completions/min_length': 780.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6799.18408203125, 'completions/min_terminated_length': 780.0, 'completions/max_terminated_length': 15841.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.20858672261238098, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020111342892050743, 'sampling/sampling_logp_difference/max': 5.900396347045898, 'sampling/importance_sampling_ratio/min': 0.0027383591514080763, 'sampling/importance_sampling_ratio/mean': 0.9999480247497559, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9098334684967995, 'clip_ratio/low_mean': 4.153812756158004e-05, 'clip_ratio/low_min': 3.606462769312202e-06, 'clip_ratio/high_mean': 3.6361936395223893e-06, 'clip_ratio/high_max': 1.4544774558089557e-05, 'clip_ratio/region_mean': 4.51743208031985e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 480/1024 [21:58:20<25:30:29, 168.80s/it][AINFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 481/1024 [22:01:00<25:03:13, 166.10s/it][A
+                                                         [A{'loss': 0.1151, 'grad_norm': 0.003897767048329115, 'learning_rate': 1e-05, 'num_tokens': 423968050.0, 'completions/mean_length': 6666.828125, 'completions/min_length': 872.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6512.587890625, 'completions/min_terminated_length': 872.0, 'completions/max_terminated_length': 15527.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3527044653892517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019923247396945953, 'sampling/sampling_logp_difference/max': 5.7499775886535645, 'sampling/importance_sampling_ratio/min': 0.0031828521750867367, 'sampling/importance_sampling_ratio/mean': 0.9999406337738037, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9162466824054718, 'clip_ratio/low_mean': 5.0774355258909054e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2372795026749372e-05, 'clip_ratio/high_max': 3.256236095694476e-05, 'clip_ratio/region_mean': 6.314715119515313e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 481/1024 [22:01:00<25:03:13, 166.10s/it][AINFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 482/1024 [22:03:57<25:31:21, 169.52s/it][A
+                                                         [A{'loss': 0.042, 'grad_norm': 0.003038195427507162, 'learning_rate': 1e-05, 'num_tokens': 424902953.0, 'completions/mean_length': 7159.8046875, 'completions/min_length': 1022.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7013.38916015625, 'completions/min_terminated_length': 1022.0, 'completions/max_terminated_length': 16223.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019014043733477592, 'sampling/sampling_logp_difference/max': 11.809727668762207, 'sampling/importance_sampling_ratio/min': 7.431909580191132e-06, 'sampling/importance_sampling_ratio/mean': 0.999940037727356, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8444746807217598, 'clip_ratio/low_mean': 7.980174223121139e-05, 'clip_ratio/low_min': 2.6713308216130827e-05, 'clip_ratio/high_mean': 4.791600815678976e-06, 'clip_ratio/high_max': 1.5341902098953142e-05, 'clip_ratio/region_mean': 8.459334412691533e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 482/1024 [22:03:57<25:31:21, 169.52s/it][AINFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 483/1024 [22:06:33<24:52:27, 165.52s/it][A
+                                                         [A{'loss': 0.0548, 'grad_norm': 0.0025550283025950193, 'learning_rate': 1e-05, 'num_tokens': 425709212.0, 'completions/mean_length': 6146.2109375, 'completions/min_length': 812.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6065.5986328125, 'completions/min_terminated_length': 812.0, 'completions/max_terminated_length': 14716.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019193854182958603, 'sampling/sampling_logp_difference/max': 7.281134128570557, 'sampling/importance_sampling_ratio/min': 0.0006884043687023222, 'sampling/importance_sampling_ratio/mean': 1.0000015497207642, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8365580290555954, 'clip_ratio/low_mean': 1.55851120666739e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.55851120666739e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 483/1024 [22:06:33<24:52:27, 165.52s/it][AINFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 484/1024 [22:09:25<25:07:49, 167.54s/it][A
+                                                         [A{'loss': 0.0698, 'grad_norm': 0.005126865580677986, 'learning_rate': 1e-05, 'num_tokens': 426566462.0, 'completions/mean_length': 6557.578125, 'completions/min_length': 437.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6321.744140625, 'completions/min_terminated_length': 437.0, 'completions/max_terminated_length': 16153.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.27852246165275574, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01839536987245083, 'sampling/sampling_logp_difference/max': 10.499993324279785, 'sampling/importance_sampling_ratio/min': 2.7536634661373682e-05, 'sampling/importance_sampling_ratio/mean': 0.9999485015869141, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8316832035779953, 'clip_ratio/low_mean': 4.780410063176532e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.036488455014478e-06, 'clip_ratio/high_max': 2.4752349872869672e-05, 'clip_ratio/region_mean': 5.484058920046664e-05, 'epoch': 0.45}
+
+ 47%|████▋     | 484/1024 [22:09:25<25:07:49, 167.54s/it][AINFO 12-02 11:34:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:34:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:34:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:34:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 485/1024 [22:12:07<24:47:59, 165.64s/it][A
+                                                         [A{'loss': 0.0753, 'grad_norm': 0.004829525947570801, 'learning_rate': 1e-05, 'num_tokens': 427480007.0, 'completions/mean_length': 7007.3203125, 'completions/min_length': 504.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6858.484375, 'completions/min_terminated_length': 504.0, 'completions/max_terminated_length': 16359.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3874102830886841, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019586069509387016, 'sampling/sampling_logp_difference/max': 8.508722305297852, 'sampling/importance_sampling_ratio/min': 0.00020170137577224523, 'sampling/importance_sampling_ratio/mean': 0.9998922944068909, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8674142584204674, 'clip_ratio/low_mean': 5.915772453590762e-05, 'clip_ratio/low_min': 1.7084812043322017e-05, 'clip_ratio/high_mean': 8.608928624198597e-06, 'clip_ratio/high_max': 3.443571449679439e-05, 'clip_ratio/region_mean': 6.776665304641938e-05, 'epoch': 0.45}
+
+ 47%|████▋     | 485/1024 [22:12:07<24:47:59, 165.64s/it][AINFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 486/1024 [22:14:53<24:46:50, 165.82s/it][A
+                                                         [A{'loss': 0.0687, 'grad_norm': 0.003539952216669917, 'learning_rate': 1e-05, 'num_tokens': 428404968.0, 'completions/mean_length': 7069.8828125, 'completions/min_length': 421.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6922.0400390625, 'completions/min_terminated_length': 421.0, 'completions/max_terminated_length': 14748.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3618982434272766, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020427238196134567, 'sampling/sampling_logp_difference/max': 8.332671165466309, 'sampling/importance_sampling_ratio/min': 0.00024052867956925184, 'sampling/importance_sampling_ratio/mean': 0.9999353885650635, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9066255167126656, 'clip_ratio/low_mean': 5.539863354897534e-05, 'clip_ratio/low_min': 8.211341992137022e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.539863354897534e-05, 'epoch': 0.45}
+
+ 47%|████▋     | 486/1024 [22:14:53<24:46:50, 165.82s/it][AINFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 487/1024 [22:17:20<23:54:46, 160.31s/it][A
+                                                         [A{'loss': 0.066, 'grad_norm': 0.0030504625756293535, 'learning_rate': 1e-05, 'num_tokens': 429137176.0, 'completions/mean_length': 5586.6875, 'completions/min_length': 602.0, 'completions/max_length': 15290.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5586.6875, 'completions/min_terminated_length': 602.0, 'completions/max_terminated_length': 15290.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.3480040729045868, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019396595656871796, 'sampling/sampling_logp_difference/max': 7.50585412979126, 'sampling/importance_sampling_ratio/min': 0.0005498559912666678, 'sampling/importance_sampling_ratio/mean': 0.9999984502792358, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9208655655384064, 'clip_ratio/low_mean': 5.576918465521885e-05, 'clip_ratio/low_min': 1.2613936178240692e-05, 'clip_ratio/high_mean': 4.137623932365386e-06, 'clip_ratio/high_max': 1.6550495729461545e-05, 'clip_ratio/region_mean': 5.99068093833921e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 487/1024 [22:17:20<23:54:46, 160.31s/it][AINFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 488/1024 [22:19:48<23:18:44, 156.58s/it][A
+                                                         [A{'loss': -0.0077, 'grad_norm': 0.003902251599356532, 'learning_rate': 1e-05, 'num_tokens': 429836026.0, 'completions/mean_length': 5266.265625, 'completions/min_length': 492.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4999.4404296875, 'completions/min_terminated_length': 492.0, 'completions/max_terminated_length': 15404.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01770034246146679, 'sampling/sampling_logp_difference/max': 2.868990898132324, 'sampling/importance_sampling_ratio/min': 0.05675617232918739, 'sampling/importance_sampling_ratio/mean': 0.9999457001686096, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7884859293699265, 'clip_ratio/low_mean': 3.6384140912559815e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.440377428087231e-06, 'clip_ratio/high_max': 3.3761509712348925e-05, 'clip_ratio/region_mean': 4.482451868170756e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 488/1024 [22:19:48<23:18:44, 156.58s/it][AINFO 12-02 11:44:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 489/1024 [22:22:19<23:00:35, 154.83s/it][A
+                                                         [A{'loss': 0.0508, 'grad_norm': 0.0024998660665005445, 'learning_rate': 1e-05, 'num_tokens': 430673446.0, 'completions/mean_length': 6398.53125, 'completions/min_length': 699.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6319.9052734375, 'completions/min_terminated_length': 699.0, 'completions/max_terminated_length': 15754.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.31929677724838257, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020521972328424454, 'sampling/sampling_logp_difference/max': 7.397497177124023, 'sampling/importance_sampling_ratio/min': 0.000612784584518522, 'sampling/importance_sampling_ratio/mean': 0.9999797940254211, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8982341960072517, 'clip_ratio/low_mean': 4.0199149452746497e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.580925744652632e-06, 'clip_ratio/high_max': 2.2323702978610527e-05, 'clip_ratio/region_mean': 4.578007497002545e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 489/1024 [22:22:19<23:00:35, 154.83s/it][AINFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 490/1024 [22:24:45<22:35:06, 152.26s/it][A
+                                                         [A{'loss': 0.0798, 'grad_norm': 0.00784115307033062, 'learning_rate': 1e-05, 'num_tokens': 431497546.0, 'completions/mean_length': 6277.65625, 'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6198.07861328125, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 14374.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.37716054916381836, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01836184598505497, 'sampling/sampling_logp_difference/max': 7.37491512298584, 'sampling/importance_sampling_ratio/min': 0.0006267798598855734, 'sampling/importance_sampling_ratio/mean': 0.999848484992981, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8139145970344543, 'clip_ratio/low_mean': 8.124458963720826e-05, 'clip_ratio/low_min': 1.2379174222587608e-05, 'clip_ratio/high_mean': 7.939156091651967e-06, 'clip_ratio/high_max': 3.1756624366607866e-05, 'clip_ratio/region_mean': 8.91837471499457e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 490/1024 [22:24:45<22:35:06, 152.26s/it][AINFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 491/1024 [22:27:42<23:36:41, 159.48s/it][A
+                                                         [A{'loss': 0.035, 'grad_norm': 0.004277343396097422, 'learning_rate': 1e-05, 'num_tokens': 432503414.0, 'completions/mean_length': 7708.59375, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7355.9345703125, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15903.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02224145457148552, 'sampling/sampling_logp_difference/max': 11.315095901489258, 'sampling/importance_sampling_ratio/min': 1.2187546417408157e-05, 'sampling/importance_sampling_ratio/mean': 0.9999503493309021, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.087083138525486, 'clip_ratio/low_mean': 2.3825880248296016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2187512058735592e-06, 'clip_ratio/high_max': 8.875004823494237e-06, 'clip_ratio/region_mean': 2.6044631454169576e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 491/1024 [22:27:42<23:36:41, 159.48s/it][AINFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 492/1024 [22:30:38<24:17:48, 164.41s/it][A
+                                                         [A{'loss': 0.0459, 'grad_norm': 0.006278311368077993, 'learning_rate': 1e-05, 'num_tokens': 433439137.0, 'completions/mean_length': 7162.7109375, 'completions/min_length': 842.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6865.25, 'completions/min_terminated_length': 842.0, 'completions/max_terminated_length': 15576.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2227931171655655, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02123419940471649, 'sampling/sampling_logp_difference/max': 7.499768257141113, 'sampling/importance_sampling_ratio/min': 0.0005532125360332429, 'sampling/importance_sampling_ratio/mean': 0.999966561794281, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9157010763883591, 'clip_ratio/low_mean': 3.561227788395627e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5960163182171527e-06, 'clip_ratio/high_max': 6.384065272868611e-06, 'clip_ratio/region_mean': 3.720829374742607e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 492/1024 [22:30:38<24:17:48, 164.41s/it][AINFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 493/1024 [22:33:20<24:10:55, 163.95s/it][A
+                                                         [A{'loss': 0.0556, 'grad_norm': 0.005177734419703484, 'learning_rate': 1e-05, 'num_tokens': 434402045.0, 'completions/mean_length': 7388.90625, 'completions/min_length': 980.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7023.251953125, 'completions/min_terminated_length': 980.0, 'completions/max_terminated_length': 16123.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.37951958179473877, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01827731542289257, 'sampling/sampling_logp_difference/max': 6.096303939819336, 'sampling/importance_sampling_ratio/min': 0.0022511729039251804, 'sampling/importance_sampling_ratio/mean': 0.9999250769615173, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7670486867427826, 'clip_ratio/low_mean': 5.1716241614485625e-05, 'clip_ratio/low_min': 3.601579010137357e-06, 'clip_ratio/high_mean': 8.656040449750435e-06, 'clip_ratio/high_max': 2.846911434062349e-05, 'clip_ratio/region_mean': 6.037228104105452e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 493/1024 [22:33:20<24:10:55, 163.95s/it][AINFO 12-02 11:58:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:58:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:58:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:58:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 494/1024 [22:36:15<24:36:02, 167.10s/it][A
+                                                         [A{'loss': 0.0662, 'grad_norm': 0.0032320048194378614, 'learning_rate': 1e-05, 'num_tokens': 435292029.0, 'completions/mean_length': 6805.375, 'completions/min_length': 587.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6496.38671875, 'completions/min_terminated_length': 587.0, 'completions/max_terminated_length': 15767.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018761277198791504, 'sampling/sampling_logp_difference/max': 9.613814353942871, 'sampling/importance_sampling_ratio/min': 6.679954094579443e-05, 'sampling/importance_sampling_ratio/mean': 0.9999642372131348, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8407405763864517, 'clip_ratio/low_mean': 7.719641234871233e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.257203722270788e-06, 'clip_ratio/high_max': 2.1548471977439476e-05, 'clip_ratio/region_mean': 8.345361538886209e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 494/1024 [22:36:15<24:36:02, 167.10s/it][AINFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 495/1024 [22:38:54<24:12:53, 164.79s/it][A
+                                                         [A{'loss': 0.0279, 'grad_norm': 0.0030854379292577505, 'learning_rate': 1e-05, 'num_tokens': 436046842.0, 'completions/mean_length': 5753.4140625, 'completions/min_length': 946.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5321.2763671875, 'completions/min_terminated_length': 946.0, 'completions/max_terminated_length': 15105.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.31405961513519287, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017733070999383926, 'sampling/sampling_logp_difference/max': 19.24954605102539, 'sampling/importance_sampling_ratio/min': 4.36544311810394e-09, 'sampling/importance_sampling_ratio/mean': 0.9998626708984375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7848984077572823, 'clip_ratio/low_mean': 7.76378024056612e-05, 'clip_ratio/low_min': 1.7026316072588088e-05, 'clip_ratio/high_mean': 8.65123752191721e-07, 'clip_ratio/high_max': 3.460495008766884e-06, 'clip_ratio/region_mean': 7.850292649891344e-05, 'epoch': 0.46}
+
+ 48%|████▊     | 495/1024 [22:38:54<24:12:53, 164.79s/it][AINFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 496/1024 [22:41:33<23:54:03, 162.96s/it][A
+                                                         [A{'loss': 0.0805, 'grad_norm': 0.003124243812635541, 'learning_rate': 1e-05, 'num_tokens': 436899638.0, 'completions/mean_length': 6522.84375, 'completions/min_length': 1062.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6445.19677734375, 'completions/min_terminated_length': 1062.0, 'completions/max_terminated_length': 15682.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2706219553947449, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021180003881454468, 'sampling/sampling_logp_difference/max': 12.316575050354004, 'sampling/importance_sampling_ratio/min': 4.476920821616659e-06, 'sampling/importance_sampling_ratio/mean': 0.9999418258666992, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0593653172254562, 'clip_ratio/low_mean': 3.234025916754035e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.301897092773288e-06, 'clip_ratio/high_max': 1.7207588371093152e-05, 'clip_ratio/region_mean': 3.664215591925313e-05, 'epoch': 0.46}
+
+ 48%|████▊     | 496/1024 [22:41:33<23:54:03, 162.96s/it][AINFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▊     | 497/1024 [22:44:12<23:40:16, 161.70s/it][A
+                                                         [A{'loss': 0.0817, 'grad_norm': 0.005001795012503862, 'learning_rate': 1e-05, 'num_tokens': 437713008.0, 'completions/mean_length': 6203.203125, 'completions/min_length': 1017.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5874.7900390625, 'completions/min_terminated_length': 1017.0, 'completions/max_terminated_length': 14515.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.26143795251846313, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017751028761267662, 'sampling/sampling_logp_difference/max': 6.34374475479126, 'sampling/importance_sampling_ratio/min': 0.001757707679644227, 'sampling/importance_sampling_ratio/mean': 0.9999101758003235, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8152795508503914, 'clip_ratio/low_mean': 2.8437304308681632e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9476084932866797e-06, 'clip_ratio/high_max': 1.1790433973146719e-05, 'clip_ratio/region_mean': 3.138491274512489e-05, 'epoch': 0.46}
+
+ 49%|████▊     | 497/1024 [22:44:12<23:40:16, 161.70s/it][AINFO 12-02 12:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:09:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▊     | 498/1024 [22:46:41<23:05:51, 158.08s/it][A
+                                                         [A{'loss': 0.0759, 'grad_norm': 0.005084732081741095, 'learning_rate': 1e-05, 'num_tokens': 438495811.0, 'completions/mean_length': 5975.5234375, 'completions/min_length': 690.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5725.72021484375, 'completions/min_terminated_length': 690.0, 'completions/max_terminated_length': 15423.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018671832978725433, 'sampling/sampling_logp_difference/max': 10.374839782714844, 'sampling/importance_sampling_ratio/min': 3.120788460364565e-05, 'sampling/importance_sampling_ratio/mean': 0.9998699426651001, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8275932744145393, 'clip_ratio/low_mean': 4.4599403963729856e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.229499381835922e-06, 'clip_ratio/high_max': 1.3163793028070359e-05, 'clip_ratio/region_mean': 4.882890357293945e-05, 'epoch': 0.46}
+
+ 49%|████▊     | 498/1024 [22:46:41<23:05:51, 158.08s/it][AINFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▊     | 499/1024 [22:49:11<22:41:04, 155.55s/it][A
+                                                         [A{'loss': 0.0282, 'grad_norm': 0.002567912917584181, 'learning_rate': 1e-05, 'num_tokens': 439413055.0, 'completions/mean_length': 7019.59375, 'completions/min_length': 1058.0, 'completions/max_length': 16110.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7019.59375, 'completions/min_terminated_length': 1058.0, 'completions/max_terminated_length': 16110.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02012534812092781, 'sampling/sampling_logp_difference/max': 6.876677513122559, 'sampling/importance_sampling_ratio/min': 0.0010315657127648592, 'sampling/importance_sampling_ratio/mean': 1.0000476837158203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9266618490219116, 'clip_ratio/low_mean': 3.0413870263146237e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.07410060588154e-07, 'clip_ratio/high_max': 3.229640242352616e-06, 'clip_ratio/region_mean': 3.1221280551108066e-05, 'epoch': 0.46}
+
+ 49%|████▊     | 499/1024 [22:49:11<22:41:04, 155.55s/it][AINFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▉     | 500/1024 [22:52:01<23:17:03, 159.97s/it][A
+                                                         [A{'loss': 0.0617, 'grad_norm': 0.004862098954617977, 'learning_rate': 1e-05, 'num_tokens': 440375128.0, 'completions/mean_length': 7373.3203125, 'completions/min_length': 854.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7082.65283203125, 'completions/min_terminated_length': 854.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020596595481038094, 'sampling/sampling_logp_difference/max': 7.28115701675415, 'sampling/importance_sampling_ratio/min': 0.0006883886526338756, 'sampling/importance_sampling_ratio/mean': 0.9999188780784607, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9383682310581207, 'clip_ratio/low_mean': 4.08189575864526e-05, 'clip_ratio/low_min': 4.041122338094283e-06, 'clip_ratio/high_mean': 4.5819448359907256e-06, 'clip_ratio/high_max': 1.8327779343962902e-05, 'clip_ratio/region_mean': 4.5400901854009135e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 500/1024 [22:52:01<23:17:03, 159.97s/it][AINFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▉     | 501/1024 [22:54:38<23:05:06, 158.90s/it][A
+                                                         [A{'loss': 0.0316, 'grad_norm': 0.003041388699784875, 'learning_rate': 1e-05, 'num_tokens': 441156306.0, 'completions/mean_length': 5944.953125, 'completions/min_length': 330.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5862.755859375, 'completions/min_terminated_length': 330.0, 'completions/max_terminated_length': 16280.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019817989319562912, 'sampling/sampling_logp_difference/max': 7.171038627624512, 'sampling/importance_sampling_ratio/min': 0.0007685241289436817, 'sampling/importance_sampling_ratio/mean': 0.9999566078186035, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9130716845393181, 'clip_ratio/low_mean': 6.364750265674957e-05, 'clip_ratio/low_min': 3.94595599573222e-06, 'clip_ratio/high_mean': 4.12654787851352e-06, 'clip_ratio/high_max': 1.650619151405408e-05, 'clip_ratio/region_mean': 6.77740499668289e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 501/1024 [22:54:38<23:05:06, 158.90s/it][AINFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▉     | 502/1024 [22:57:32<23:42:47, 163.54s/it][A
+                                                         [A{'loss': 0.0306, 'grad_norm': 0.005679543130099773, 'learning_rate': 1e-05, 'num_tokens': 442032972.0, 'completions/mean_length': 6686.015625, 'completions/min_length': 1018.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6609.6533203125, 'completions/min_terminated_length': 1018.0, 'completions/max_terminated_length': 16181.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.24988999962806702, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019738182425498962, 'sampling/sampling_logp_difference/max': 4.86245584487915, 'sampling/importance_sampling_ratio/min': 0.007731473073363304, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8640913739800453, 'clip_ratio/low_mean': 3.147234815514821e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.205811807078135e-06, 'clip_ratio/high_max': 2.9951792839710834e-05, 'clip_ratio/region_mean': 4.0678160075913183e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 502/1024 [22:57:32<23:42:47, 163.54s/it][AINFO 12-02 12:22:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▉     | 503/1024 [23:00:37<24:34:44, 169.84s/it][A
+                                                         [A{'loss': 0.0756, 'grad_norm': 0.006176612339913845, 'learning_rate': 1e-05, 'num_tokens': 442940940.0, 'completions/mean_length': 6945.5, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6231.6640625, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 15951.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.29644322395324707, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01836501806974411, 'sampling/sampling_logp_difference/max': 8.607227325439453, 'sampling/importance_sampling_ratio/min': 0.00018278000061400235, 'sampling/importance_sampling_ratio/mean': 0.9999117851257324, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8156519457697868, 'clip_ratio/low_mean': 3.858067566397949e-05, 'clip_ratio/low_min': 9.290916750614997e-06, 'clip_ratio/high_mean': 7.5476494316717435e-06, 'clip_ratio/high_max': 3.0190597726686974e-05, 'clip_ratio/region_mean': 4.612832617567619e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 503/1024 [23:00:37<24:34:44, 169.84s/it][AINFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▉     | 504/1024 [23:03:54<25:43:39, 178.11s/it][A
+                                                         [A{'loss': 0.0386, 'grad_norm': 0.0021770994644612074, 'learning_rate': 1e-05, 'num_tokens': 443992041.0, 'completions/mean_length': 8068.5390625, 'completions/min_length': 875.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7363.8388671875, 'completions/min_terminated_length': 875.0, 'completions/max_terminated_length': 15847.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019003838300704956, 'sampling/sampling_logp_difference/max': 8.624998092651367, 'sampling/importance_sampling_ratio/min': 0.0001795605494407937, 'sampling/importance_sampling_ratio/mean': 0.9999759197235107, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8196670189499855, 'clip_ratio/low_mean': 3.060894187001395e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.28071654773521e-06, 'clip_ratio/high_max': 2.2105000425653998e-05, 'clip_ratio/region_mean': 3.6889658531436e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 504/1024 [23:03:54<25:43:39, 178.11s/it][AINFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▉     | 505/1024 [23:06:29<24:41:10, 171.23s/it][A
+                                                         [A{'loss': 0.063, 'grad_norm': 0.00788798462599516, 'learning_rate': 1e-05, 'num_tokens': 444679675.0, 'completions/mean_length': 5209.140625, 'completions/min_length': 136.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5031.76220703125, 'completions/min_terminated_length': 136.0, 'completions/max_terminated_length': 15168.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.33220988512039185, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018808994442224503, 'sampling/sampling_logp_difference/max': 8.267484664916992, 'sampling/importance_sampling_ratio/min': 0.00025673024356365204, 'sampling/importance_sampling_ratio/mean': 0.9999796748161316, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8851845487952232, 'clip_ratio/low_mean': 4.5685408849749365e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2181025062527624e-06, 'clip_ratio/high_max': 1.287241002501105e-05, 'clip_ratio/region_mean': 4.89035115833758e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 505/1024 [23:06:29<24:41:10, 171.23s/it][AINFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▉     | 506/1024 [23:09:26<24:53:12, 172.96s/it][A
+                                                         [A{'loss': 0.1586, 'grad_norm': 0.004547314252704382, 'learning_rate': 1e-05, 'num_tokens': 445668126.0, 'completions/mean_length': 7558.8984375, 'completions/min_length': 707.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7274.21728515625, 'completions/min_terminated_length': 707.0, 'completions/max_terminated_length': 16259.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.42293959856033325, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.02099413052201271, 'sampling/sampling_logp_difference/max': 9.059958457946777, 'sampling/importance_sampling_ratio/min': 0.00011622780584730208, 'sampling/importance_sampling_ratio/mean': 0.999848484992981, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.003449946641922, 'clip_ratio/low_mean': 5.944662643742049e-05, 'clip_ratio/low_min': 8.106994755507912e-06, 'clip_ratio/high_mean': 6.590465602585027e-06, 'clip_ratio/high_max': 2.294301202709903e-05, 'clip_ratio/region_mean': 6.603709243790945e-05, 'epoch': 0.47}
+
+ 49%|████▉     | 506/1024 [23:09:26<24:53:12, 172.96s/it][AINFO 12-02 12:34:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:34:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:34:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:34:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|████▉     | 507/1024 [23:12:21<24:55:19, 173.54s/it][A
+                                                         [A{'loss': 0.121, 'grad_norm': 0.004621773958206177, 'learning_rate': 1e-05, 'num_tokens': 446464587.0, 'completions/mean_length': 6066.6015625, 'completions/min_length': 1107.0, 'completions/max_length': 16137.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6066.6015625, 'completions/min_terminated_length': 1107.0, 'completions/max_terminated_length': 16137.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3537652790546417, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018016980960965157, 'sampling/sampling_logp_difference/max': 11.179987907409668, 'sampling/importance_sampling_ratio/min': 1.3950601896794979e-05, 'sampling/importance_sampling_ratio/mean': 1.0000154972076416, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8450648710131645, 'clip_ratio/low_mean': 8.880347786544007e-05, 'clip_ratio/low_min': 9.06585455595632e-06, 'clip_ratio/high_mean': 6.047981628398702e-06, 'clip_ratio/high_max': 2.1350435872591333e-05, 'clip_ratio/region_mean': 9.485145938015194e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 507/1024 [23:12:21<24:55:19, 173.54s/it][AINFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|████▉     | 508/1024 [23:15:04<24:26:00, 170.47s/it][A
+                                                         [A{'loss': 0.0396, 'grad_norm': 0.004523546434938908, 'learning_rate': 1e-05, 'num_tokens': 447381134.0, 'completions/mean_length': 6988.0234375, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6838.88134765625, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.22567617893218994, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021812722086906433, 'sampling/sampling_logp_difference/max': 4.124781131744385, 'sampling/importance_sampling_ratio/min': 0.016167031601071358, 'sampling/importance_sampling_ratio/mean': 0.9999901056289673, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0452716201543808, 'clip_ratio/low_mean': 2.149350007130124e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.633681207153131e-07, 'clip_ratio/high_max': 3.0534724828612525e-06, 'clip_ratio/region_mean': 2.2256868305703392e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 508/1024 [23:15:04<24:26:00, 170.47s/it][AINFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|████▉     | 509/1024 [23:17:09<22:24:40, 156.66s/it][A
+                                                         [A{'loss': 0.0188, 'grad_norm': 0.004002885892987251, 'learning_rate': 1e-05, 'num_tokens': 448158014.0, 'completions/mean_length': 5948.5, 'completions/min_length': 1252.0, 'completions/max_length': 12316.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5948.5, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 12316.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.3124620020389557, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018487900495529175, 'sampling/sampling_logp_difference/max': 7.062494277954102, 'sampling/importance_sampling_ratio/min': 0.0008566387114115059, 'sampling/importance_sampling_ratio/mean': 0.9999228715896606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8241566568613052, 'clip_ratio/low_mean': 3.684896307731833e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3968978009870625e-06, 'clip_ratio/high_max': 5.58759120394825e-06, 'clip_ratio/region_mean': 3.824586099199223e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 509/1024 [23:17:09<22:24:40, 156.66s/it][AINFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|████▉     | 510/1024 [23:20:04<23:08:43, 162.11s/it][A
+                                                         [A{'loss': 0.0787, 'grad_norm': 0.0019062751671299338, 'learning_rate': 1e-05, 'num_tokens': 449197054.0, 'completions/mean_length': 7966.375, 'completions/min_length': 660.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7764.3525390625, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 16044.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020015282556414604, 'sampling/sampling_logp_difference/max': 8.731462478637695, 'sampling/importance_sampling_ratio/min': 0.0001614262000657618, 'sampling/importance_sampling_ratio/mean': 0.9999173879623413, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8868448063731194, 'clip_ratio/low_mean': 3.973086239739132e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.439610338773491e-06, 'clip_ratio/high_max': 1.0490723752809572e-05, 'clip_ratio/region_mean': 4.3170473020381905e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 510/1024 [23:20:04<23:08:43, 162.11s/it][AINFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|████▉     | 511/1024 [23:22:36<22:41:53, 159.29s/it][A
+                                                         [A{'loss': 0.0547, 'grad_norm': 0.00490277074277401, 'learning_rate': 1e-05, 'num_tokens': 450050153.0, 'completions/mean_length': 6520.0234375, 'completions/min_length': 461.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6442.3544921875, 'completions/min_terminated_length': 461.0, 'completions/max_terminated_length': 16124.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3437528908252716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020178331062197685, 'sampling/sampling_logp_difference/max': 12.324441909790039, 'sampling/importance_sampling_ratio/min': 4.4418397919798736e-06, 'sampling/importance_sampling_ratio/mean': 0.9998800754547119, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9168323278427124, 'clip_ratio/low_mean': 3.558348203114292e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0487764206554857e-06, 'clip_ratio/high_max': 1.2195105682621943e-05, 'clip_ratio/region_mean': 3.8632259474979946e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 511/1024 [23:22:36<22:41:53, 159.29s/it][AINFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 512/1024 [23:25:44<23:52:32, 167.88s/it][A
+                                                         [A{'loss': -0.0023, 'grad_norm': 0.003792276605963707, 'learning_rate': 1e-05, 'num_tokens': 450915281.0, 'completions/mean_length': 6614.5625, 'completions/min_length': 429.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6217.4306640625, 'completions/min_terminated_length': 429.0, 'completions/max_terminated_length': 16252.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.20069602131843567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019233014434576035, 'sampling/sampling_logp_difference/max': 5.40609884262085, 'sampling/importance_sampling_ratio/min': 0.004489119164645672, 'sampling/importance_sampling_ratio/mean': 0.9999154806137085, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8635925352573395, 'clip_ratio/low_mean': 3.363800146871654e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.989432121263235e-06, 'clip_ratio/high_max': 7.95772848505294e-06, 'clip_ratio/region_mean': 3.562743381735345e-05, 'epoch': 0.47}
+
+ 50%|█████     | 512/1024 [23:25:44<23:52:32, 167.88s/it][AINFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 513/1024 [23:28:38<24:05:47, 169.76s/it][A
+                                                         [A{'loss': 0.0287, 'grad_norm': 0.0031763892620801926, 'learning_rate': 1e-05, 'num_tokens': 451761322.0, 'completions/mean_length': 6458.5078125, 'completions/min_length': 1025.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5970.36865234375, 'completions/min_terminated_length': 1025.0, 'completions/max_terminated_length': 16206.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.282474160194397, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01935420371592045, 'sampling/sampling_logp_difference/max': 9.24997615814209, 'sampling/importance_sampling_ratio/min': 9.611394489184022e-05, 'sampling/importance_sampling_ratio/mean': 0.9999036192893982, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8816124573349953, 'clip_ratio/low_mean': 3.4846169796765025e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.67555605105008e-06, 'clip_ratio/high_max': 1.6306271390931215e-05, 'clip_ratio/region_mean': 4.1521726302562456e-05, 'epoch': 0.47}
+
+ 50%|█████     | 513/1024 [23:28:38<24:05:47, 169.76s/it][AINFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 514/1024 [23:31:14<23:26:07, 165.43s/it][A
+                                                         [A{'loss': 0.1094, 'grad_norm': 0.004134794697165489, 'learning_rate': 1e-05, 'num_tokens': 452526342.0, 'completions/mean_length': 5844.03125, 'completions/min_length': 237.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5676.73046875, 'completions/min_terminated_length': 237.0, 'completions/max_terminated_length': 15928.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.28930899500846863, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02013866975903511, 'sampling/sampling_logp_difference/max': 8.951433181762695, 'sampling/importance_sampling_ratio/min': 0.00012955136480741203, 'sampling/importance_sampling_ratio/mean': 0.9999297857284546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9008020162582397, 'clip_ratio/low_mean': 2.2518463538290234e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0314158721012063e-06, 'clip_ratio/high_max': 7.861634912842419e-06, 'clip_ratio/region_mean': 2.554987941039144e-05, 'epoch': 0.47}
+
+ 50%|█████     | 514/1024 [23:31:14<23:26:07, 165.43s/it][AINFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 515/1024 [23:33:57<23:18:56, 164.90s/it][A
+                                                         [A{'loss': 0.0193, 'grad_norm': 0.0022520655766129494, 'learning_rate': 1e-05, 'num_tokens': 453343385.0, 'completions/mean_length': 6214.5859375, 'completions/min_length': 1096.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6134.51171875, 'completions/min_terminated_length': 1096.0, 'completions/max_terminated_length': 16180.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.20623260736465454, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019947605207562447, 'sampling/sampling_logp_difference/max': 10.187482833862305, 'sampling/importance_sampling_ratio/min': 3.763851054827683e-05, 'sampling/importance_sampling_ratio/mean': 0.9999879598617554, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9522949978709221, 'clip_ratio/low_mean': 2.444096298859222e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.177790176778217e-06, 'clip_ratio/high_max': 1.2711160707112867e-05, 'clip_ratio/region_mean': 2.761875293799676e-05, 'epoch': 0.47}
+
+ 50%|█████     | 515/1024 [23:33:57<23:18:56, 164.90s/it][AINFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 516/1024 [23:36:44<23:21:12, 165.50s/it][A
+                                                         [A{'loss': 0.0609, 'grad_norm': 0.004887089133262634, 'learning_rate': 1e-05, 'num_tokens': 454275379.0, 'completions/mean_length': 7138.515625, 'completions/min_length': 846.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7065.71630859375, 'completions/min_terminated_length': 846.0, 'completions/max_terminated_length': 14376.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.32035762071609497, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019449077546596527, 'sampling/sampling_logp_difference/max': 5.312184810638428, 'sampling/importance_sampling_ratio/min': 0.004931141622364521, 'sampling/importance_sampling_ratio/mean': 0.9999544620513916, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8856206461787224, 'clip_ratio/low_mean': 3.371703428456385e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.431061753009999e-05, 'clip_ratio/high_max': 5.724247012039996e-05, 'clip_ratio/region_mean': 4.8027652155724354e-05, 'epoch': 0.47}
+
+ 50%|█████     | 516/1024 [23:36:44<23:21:12, 165.50s/it][AINFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|█████     | 517/1024 [23:39:19<22:50:46, 162.22s/it][A
+                                                         [A{'loss': 0.0366, 'grad_norm': 0.003875041613355279, 'learning_rate': 1e-05, 'num_tokens': 455076625.0, 'completions/mean_length': 6077.796875, 'completions/min_length': 954.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5915.00830078125, 'completions/min_terminated_length': 954.0, 'completions/max_terminated_length': 15855.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.23933593928813934, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018907926976680756, 'sampling/sampling_logp_difference/max': 10.31219482421875, 'sampling/importance_sampling_ratio/min': 3.322543852846138e-05, 'sampling/importance_sampling_ratio/mean': 1.0000392198562622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.862022191286087, 'clip_ratio/low_mean': 4.936055870530254e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9019220139380195e-06, 'clip_ratio/high_max': 1.5607688055752078e-05, 'clip_ratio/region_mean': 5.326248106030107e-05, 'epoch': 0.48}
+
+ 50%|█████     | 517/1024 [23:39:19<22:50:46, 162.22s/it][AINFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 518/1024 [23:41:40<21:55:15, 155.96s/it][A
+                                                         [A{'loss': 0.0822, 'grad_norm': 0.004288897849619389, 'learning_rate': 1e-05, 'num_tokens': 455889693.0, 'completions/mean_length': 6211.65625, 'completions/min_length': 1292.0, 'completions/max_length': 15316.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6211.65625, 'completions/min_terminated_length': 1292.0, 'completions/max_terminated_length': 15316.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.27145031094551086, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01986120268702507, 'sampling/sampling_logp_difference/max': 12.874927520751953, 'sampling/importance_sampling_ratio/min': 2.5614745027269237e-06, 'sampling/importance_sampling_ratio/mean': 0.9999270439147949, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8835236355662346, 'clip_ratio/low_mean': 3.7409978290270374e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.535163386914064e-06, 'clip_ratio/high_max': 1.0557040241110371e-05, 'clip_ratio/region_mean': 4.0945141790871276e-05, 'epoch': 0.48}
+
+ 51%|█████     | 518/1024 [23:41:40<21:55:15, 155.96s/it][AINFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 519/1024 [23:44:24<22:13:33, 158.44s/it][A
+                                                         [A{'loss': 0.0311, 'grad_norm': 0.004230308346450329, 'learning_rate': 1e-05, 'num_tokens': 456809643.0, 'completions/mean_length': 7035.609375, 'completions/min_length': 762.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6962.0, 'completions/min_terminated_length': 762.0, 'completions/max_terminated_length': 16128.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.17282497882843018, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020262110978364944, 'sampling/sampling_logp_difference/max': 10.99984073638916, 'sampling/importance_sampling_ratio/min': 1.670435995038133e-05, 'sampling/importance_sampling_ratio/mean': 0.9999722242355347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9033957049250603, 'clip_ratio/low_mean': 3.578249538804812e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.105663826223463e-07, 'clip_ratio/high_max': 2.842265530489385e-06, 'clip_ratio/region_mean': 3.649306199804414e-05, 'epoch': 0.48}
+
+ 51%|█████     | 519/1024 [23:44:24<22:13:33, 158.44s/it][AINFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 520/1024 [23:47:07<22:21:16, 159.68s/it][A
+                                                         [A{'loss': 0.0204, 'grad_norm': 0.0029154124204069376, 'learning_rate': 1e-05, 'num_tokens': 457669431.0, 'completions/mean_length': 6557.40625, 'completions/min_length': 1136.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6321.568359375, 'completions/min_terminated_length': 1136.0, 'completions/max_terminated_length': 16241.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019474683329463005, 'sampling/sampling_logp_difference/max': 9.746816635131836, 'sampling/importance_sampling_ratio/min': 5.8480534789850935e-05, 'sampling/importance_sampling_ratio/mean': 1.0000226497650146, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8352414071559906, 'clip_ratio/low_mean': 2.8534720058814855e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.844010264714598e-06, 'clip_ratio/high_max': 3.539844283295679e-05, 'clip_ratio/region_mean': 3.837873060774655e-05, 'epoch': 0.48}
+
+ 51%|█████     | 520/1024 [23:47:07<22:21:16, 159.68s/it][AINFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 521/1024 [23:49:54<22:36:20, 161.79s/it][A
+                                                         [A{'loss': 0.0248, 'grad_norm': 0.0025195449125021696, 'learning_rate': 1e-05, 'num_tokens': 458512648.0, 'completions/mean_length': 6444.1328125, 'completions/min_length': 398.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6205.576171875, 'completions/min_terminated_length': 398.0, 'completions/max_terminated_length': 15428.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01779567077755928, 'sampling/sampling_logp_difference/max': 10.624913215637207, 'sampling/importance_sampling_ratio/min': 2.4302940801135264e-05, 'sampling/importance_sampling_ratio/mean': 0.999996542930603, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7480100840330124, 'clip_ratio/low_mean': 5.166920755073079e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.917558859076962e-05, 'clip_ratio/high_max': 6.400114170901361e-05, 'clip_ratio/region_mean': 7.084479466357152e-05, 'epoch': 0.48}
+
+ 51%|█████     | 521/1024 [23:49:54<22:36:20, 161.79s/it][AINFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 522/1024 [23:52:43<22:51:47, 163.96s/it][A
+                                                         [A{'loss': 0.0608, 'grad_norm': 0.004339073318988085, 'learning_rate': 1e-05, 'num_tokens': 459377790.0, 'completions/mean_length': 6615.234375, 'completions/min_length': 105.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6380.7841796875, 'completions/min_terminated_length': 105.0, 'completions/max_terminated_length': 15868.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.31064465641975403, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018815383315086365, 'sampling/sampling_logp_difference/max': 7.76359748840332, 'sampling/importance_sampling_ratio/min': 0.00042492515058256686, 'sampling/importance_sampling_ratio/mean': 0.9999370574951172, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8428665772080421, 'clip_ratio/low_mean': 3.4855478702411347e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.87236081375886e-07, 'clip_ratio/high_max': 2.748944325503544e-06, 'clip_ratio/region_mean': 3.5542715181691165e-05, 'epoch': 0.48}
+
+ 51%|█████     | 522/1024 [23:52:43<22:51:47, 163.96s/it][AINFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 523/1024 [23:55:16<22:21:33, 160.67s/it][A
+                                                         [A{'loss': 0.0502, 'grad_norm': 0.005003004334867001, 'learning_rate': 1e-05, 'num_tokens': 460189823.0, 'completions/mean_length': 6200.3203125, 'completions/min_length': 1032.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5955.912109375, 'completions/min_terminated_length': 1032.0, 'completions/max_terminated_length': 15239.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0192951001226902, 'sampling/sampling_logp_difference/max': 5.2945051193237305, 'sampling/importance_sampling_ratio/min': 0.005019097588956356, 'sampling/importance_sampling_ratio/mean': 0.9999645948410034, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9044734612107277, 'clip_ratio/low_mean': 2.2591082483813807e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.496596083456097e-06, 'clip_ratio/high_max': 2.2513844896820956e-05, 'clip_ratio/region_mean': 3.0087678169365972e-05, 'epoch': 0.48}
+
+ 51%|█████     | 523/1024 [23:55:16<22:21:33, 160.67s/it][AINFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 524/1024 [23:57:34<21:24:07, 154.10s/it][A
+                                                         [A{'loss': 0.0209, 'grad_norm': 0.005491400603204966, 'learning_rate': 1e-05, 'num_tokens': 460944164.0, 'completions/mean_length': 5758.9140625, 'completions/min_length': 1181.0, 'completions/max_length': 15706.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5758.9140625, 'completions/min_terminated_length': 1181.0, 'completions/max_terminated_length': 15706.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.2330428510904312, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019315458834171295, 'sampling/sampling_logp_difference/max': 5.54492712020874, 'sampling/importance_sampling_ratio/min': 0.003907227888703346, 'sampling/importance_sampling_ratio/mean': 0.9999998807907104, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8783154934644699, 'clip_ratio/low_mean': 3.145246773783583e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.771700446326577e-06, 'clip_ratio/high_max': 1.9086801785306307e-05, 'clip_ratio/region_mean': 3.622416772941506e-05, 'epoch': 0.48}
+
+ 51%|█████     | 524/1024 [23:57:34<21:24:07, 154.10s/it][AINFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████▏    | 525/1024 [24:00:21<21:52:06, 157.77s/it][A
+                                                         [A{'loss': 0.0103, 'grad_norm': 0.0038622859865427017, 'learning_rate': 1e-05, 'num_tokens': 461931916.0, 'completions/mean_length': 7573.375, 'completions/min_length': 1579.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7504.0, 'completions/min_terminated_length': 1579.0, 'completions/max_terminated_length': 15536.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.14123955368995667, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02145528793334961, 'sampling/sampling_logp_difference/max': 6.1500749588012695, 'sampling/importance_sampling_ratio/min': 0.002133321948349476, 'sampling/importance_sampling_ratio/mean': 0.9999769926071167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.057753436267376, 'clip_ratio/low_mean': 9.616303373150004e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.888714672939386e-06, 'clip_ratio/high_max': 1.5554858691757545e-05, 'clip_ratio/region_mean': 1.3505018273463065e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 525/1024 [24:00:21<21:52:06, 157.77s/it][AINFO 12-02 13:25:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:25:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:25:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:25:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████▏    | 526/1024 [24:02:58<21:47:21, 157.51s/it][A
+                                                         [A{'loss': 0.0506, 'grad_norm': 0.002902502194046974, 'learning_rate': 1e-05, 'num_tokens': 462894701.0, 'completions/mean_length': 7353.0703125, 'completions/min_length': 907.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7136.328125, 'completions/min_terminated_length': 907.0, 'completions/max_terminated_length': 14553.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021296534687280655, 'sampling/sampling_logp_difference/max': 5.312461853027344, 'sampling/importance_sampling_ratio/min': 0.00492977537214756, 'sampling/importance_sampling_ratio/mean': 0.9999150037765503, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9386680871248245, 'clip_ratio/low_mean': 4.7102344296945375e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.324094329102081e-06, 'clip_ratio/high_max': 2.2185531634022482e-05, 'clip_ratio/region_mean': 5.342643908079481e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 526/1024 [24:02:58<21:47:21, 157.51s/it][AINFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████▏    | 527/1024 [24:05:41<21:58:00, 159.11s/it][A
+                                                         [A{'loss': 0.0546, 'grad_norm': 0.002602500608190894, 'learning_rate': 1e-05, 'num_tokens': 463849087.0, 'completions/mean_length': 7280.953125, 'completions/min_length': 1111.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6987.30615234375, 'completions/min_terminated_length': 1111.0, 'completions/max_terminated_length': 15851.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020630592480301857, 'sampling/sampling_logp_difference/max': 10.12484359741211, 'sampling/importance_sampling_ratio/min': 4.007156167062931e-05, 'sampling/importance_sampling_ratio/mean': 0.9999302625656128, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9424067437648773, 'clip_ratio/low_mean': 5.111583186589996e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.666198492486728e-06, 'clip_ratio/high_max': 1.8664793969946913e-05, 'clip_ratio/region_mean': 5.578203035838669e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 527/1024 [24:05:41<21:58:00, 159.11s/it][AINFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 528/1024 [24:07:59<21:03:43, 152.87s/it][A
+                                                         [A{'loss': 0.1494, 'grad_norm': 0.005743890535086393, 'learning_rate': 1e-05, 'num_tokens': 464704336.0, 'completions/mean_length': 6520.6328125, 'completions/min_length': 1459.0, 'completions/max_length': 14628.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6520.6328125, 'completions/min_terminated_length': 1459.0, 'completions/max_terminated_length': 14628.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3413938879966736, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018370801582932472, 'sampling/sampling_logp_difference/max': 9.74838638305664, 'sampling/importance_sampling_ratio/min': 5.838880315423012e-05, 'sampling/importance_sampling_ratio/mean': 0.9999988079071045, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8501213267445564, 'clip_ratio/low_mean': 4.5688502041230095e-05, 'clip_ratio/low_min': 5.72383623875794e-06, 'clip_ratio/high_mean': 1.0150766001970624e-05, 'clip_ratio/high_max': 3.77411461158772e-05, 'clip_ratio/region_mean': 5.583926849794807e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 528/1024 [24:07:59<21:03:43, 152.87s/it][AINFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 529/1024 [24:10:23<20:39:19, 150.22s/it][A
+                                                         [A{'loss': 0.0967, 'grad_norm': 0.004826955031603575, 'learning_rate': 1e-05, 'num_tokens': 465632152.0, 'completions/mean_length': 7111.0, 'completions/min_length': 1288.0, 'completions/max_length': 14675.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7111.0, 'completions/min_terminated_length': 1288.0, 'completions/max_terminated_length': 14675.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2975040376186371, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019976403564214706, 'sampling/sampling_logp_difference/max': 9.061508178710938, 'sampling/importance_sampling_ratio/min': 0.00011604782775975764, 'sampling/importance_sampling_ratio/mean': 0.9999524354934692, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8829544633626938, 'clip_ratio/low_mean': 2.1804387529300584e-05, 'clip_ratio/low_min': 3.918126822100021e-06, 'clip_ratio/high_mean': 2.287563575009699e-06, 'clip_ratio/high_max': 9.150254300038796e-06, 'clip_ratio/region_mean': 2.4091951559057634e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 529/1024 [24:10:23<20:39:19, 150.22s/it][AINFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 530/1024 [24:13:18<21:38:33, 157.72s/it][A
+                                                         [A{'loss': 0.0447, 'grad_norm': 0.0028944616205990314, 'learning_rate': 1e-05, 'num_tokens': 466648507.0, 'completions/mean_length': 7797.7109375, 'completions/min_length': 769.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7448.67431640625, 'completions/min_terminated_length': 769.0, 'completions/max_terminated_length': 15132.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020830729976296425, 'sampling/sampling_logp_difference/max': 8.25, 'sampling/importance_sampling_ratio/min': 0.0002612585376482457, 'sampling/importance_sampling_ratio/mean': 0.999991774559021, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9747610911726952, 'clip_ratio/low_mean': 4.392900382299558e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.603994390592561e-06, 'clip_ratio/high_max': 2.3185014015325578e-05, 'clip_ratio/region_mean': 5.153299889570917e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 530/1024 [24:13:18<21:38:33, 157.72s/it][AINFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 531/1024 [24:15:57<21:37:52, 157.96s/it][A
+                                                         [A{'loss': 0.0573, 'grad_norm': 0.003612271510064602, 'learning_rate': 1e-05, 'num_tokens': 467487976.0, 'completions/mean_length': 6395.4765625, 'completions/min_length': 227.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6316.82666015625, 'completions/min_terminated_length': 227.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01959329843521118, 'sampling/sampling_logp_difference/max': 13.624999046325684, 'sampling/importance_sampling_ratio/min': 1.209868287332938e-06, 'sampling/importance_sampling_ratio/mean': 0.9998596906661987, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9015842452645302, 'clip_ratio/low_mean': 4.282657914700394e-05, 'clip_ratio/low_min': 4.545454430626705e-06, 'clip_ratio/high_mean': 3.7368648690971895e-06, 'clip_ratio/high_max': 1.4947459476388758e-05, 'clip_ratio/region_mean': 4.656344435716164e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 531/1024 [24:15:57<21:37:52, 157.96s/it][AINFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 532/1024 [24:18:50<22:14:13, 162.71s/it][A
+                                                         [A{'loss': 0.0104, 'grad_norm': 0.002104024635627866, 'learning_rate': 1e-05, 'num_tokens': 468445132.0, 'completions/mean_length': 7298.78125, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7154.57177734375, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 15694.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2301519513130188, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021517785266041756, 'sampling/sampling_logp_difference/max': 9.872424125671387, 'sampling/importance_sampling_ratio/min': 5.157754640094936e-05, 'sampling/importance_sampling_ratio/mean': 0.9999783039093018, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9978953301906586, 'clip_ratio/low_mean': 1.8946868863167765e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.8946868863167765e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 532/1024 [24:18:50<22:14:13, 162.71s/it][AINFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 533/1024 [24:21:45<22:41:23, 166.36s/it][A
+                                                         [A{'loss': 0.0298, 'grad_norm': 0.0009346248698420823, 'learning_rate': 1e-05, 'num_tokens': 469360760.0, 'completions/mean_length': 7021.53125, 'completions/min_length': 693.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6561.08154296875, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 16003.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.20069600641727448, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020538944751024246, 'sampling/sampling_logp_difference/max': 5.8098626136779785, 'sampling/importance_sampling_ratio/min': 0.0029978419188410044, 'sampling/importance_sampling_ratio/mean': 0.9999547004699707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9539581760764122, 'clip_ratio/low_mean': 3.0451521752183908e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.441706659643387e-06, 'clip_ratio/high_max': 2.0034196040796814e-05, 'clip_ratio/region_mean': 3.689322829814046e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 533/1024 [24:21:45<22:41:23, 166.36s/it][AINFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 534/1024 [24:24:15<21:58:43, 161.48s/it][A
+                                                         [A{'loss': 0.0522, 'grad_norm': 0.002331435214728117, 'learning_rate': 1e-05, 'num_tokens': 470274859.0, 'completions/mean_length': 6988.2109375, 'completions/min_length': 1047.0, 'completions/max_length': 15370.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6988.2109375, 'completions/min_terminated_length': 1047.0, 'completions/max_terminated_length': 15370.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.23751860857009888, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02088295854628086, 'sampling/sampling_logp_difference/max': 6.460330963134766, 'sampling/importance_sampling_ratio/min': 0.0015642779180780053, 'sampling/importance_sampling_ratio/mean': 1.000002145767212, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9471191540360451, 'clip_ratio/low_mean': 3.2224923302237585e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.968734807178407e-06, 'clip_ratio/high_max': 7.874939228713629e-06, 'clip_ratio/region_mean': 3.419365827994625e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 534/1024 [24:24:15<21:58:43, 161.48s/it][AINFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 535/1024 [24:27:10<22:27:59, 165.40s/it][A
+                                                         [A{'loss': 0.0617, 'grad_norm': 0.004562230780720711, 'learning_rate': 1e-05, 'num_tokens': 471263997.0, 'completions/mean_length': 7557.453125, 'completions/min_length': 1064.0, 'completions/max_length': 16212.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7557.453125, 'completions/min_terminated_length': 1064.0, 'completions/max_terminated_length': 16212.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2511882185935974, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02160259149968624, 'sampling/sampling_logp_difference/max': 8.748924255371094, 'sampling/importance_sampling_ratio/min': 0.0001586318830959499, 'sampling/importance_sampling_ratio/mean': 1.000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9897207245230675, 'clip_ratio/low_mean': 3.8229277151913266e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0911525641386106e-06, 'clip_ratio/high_max': 1.2364610256554442e-05, 'clip_ratio/region_mean': 4.132042954552162e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 535/1024 [24:27:10<22:27:59, 165.40s/it][AINFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 536/1024 [24:29:46<22:01:27, 162.48s/it][A
+                                                         [A{'loss': 0.0219, 'grad_norm': 0.004525062162429094, 'learning_rate': 1e-05, 'num_tokens': 472120622.0, 'completions/mean_length': 6532.1953125, 'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6295.75244140625, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 15603.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3487703502178192, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019527796655893326, 'sampling/sampling_logp_difference/max': 11.124346733093262, 'sampling/importance_sampling_ratio/min': 1.474883083574241e-05, 'sampling/importance_sampling_ratio/mean': 0.9999650120735168, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9109068289399147, 'clip_ratio/low_mean': 5.8747830053107464e-05, 'clip_ratio/low_min': 1.3906133062846493e-05, 'clip_ratio/high_mean': 7.420082738462952e-06, 'clip_ratio/high_max': 2.6050724500237266e-05, 'clip_ratio/region_mean': 6.616791324631777e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 536/1024 [24:29:46<22:01:27, 162.48s/it][AINFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 537/1024 [24:32:16<21:29:54, 158.92s/it][A
+                                                         [A{'loss': 0.0165, 'grad_norm': 0.005058468785136938, 'learning_rate': 1e-05, 'num_tokens': 472906346.0, 'completions/mean_length': 5994.40625, 'completions/min_length': 531.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5912.5986328125, 'completions/min_terminated_length': 531.0, 'completions/max_terminated_length': 15011.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.19044627249240875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020568232983350754, 'sampling/sampling_logp_difference/max': 7.562398910522461, 'sampling/importance_sampling_ratio/min': 0.0005196271813474596, 'sampling/importance_sampling_ratio/mean': 0.9999456405639648, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9276224821805954, 'clip_ratio/low_mean': 3.90738064766083e-05, 'clip_ratio/low_min': 1.0626089533616323e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.90738064766083e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 537/1024 [24:32:16<21:29:54, 158.92s/it][AINFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 538/1024 [24:34:52<21:20:22, 158.07s/it][A
+                                                         [A{'loss': 0.1282, 'grad_norm': 0.007286665495485067, 'learning_rate': 1e-05, 'num_tokens': 473756256.0, 'completions/mean_length': 6469.046875, 'completions/min_length': 891.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6311.6669921875, 'completions/min_terminated_length': 891.0, 'completions/max_terminated_length': 15992.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.35772189497947693, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019624462351202965, 'sampling/sampling_logp_difference/max': 9.681252479553223, 'sampling/importance_sampling_ratio/min': 6.244324322324246e-05, 'sampling/importance_sampling_ratio/mean': 1.0000038146972656, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9536962807178497, 'clip_ratio/low_mean': 5.992188062009518e-05, 'clip_ratio/low_min': 1.2131874427723233e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.992188062009518e-05, 'epoch': 0.49}
+
+ 53%|█████▎    | 538/1024 [24:34:52<21:20:22, 158.07s/it][AINFO 12-02 13:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:59:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 539/1024 [24:37:03<20:10:27, 149.75s/it][A
+                                                         [A{'loss': -0.0091, 'grad_norm': 0.0031439310405403376, 'learning_rate': 1e-05, 'num_tokens': 474515194.0, 'completions/mean_length': 5778.703125, 'completions/min_length': 903.0, 'completions/max_length': 15383.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5778.703125, 'completions/min_terminated_length': 903.0, 'completions/max_terminated_length': 15383.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019796252250671387, 'sampling/sampling_logp_difference/max': 7.374977111816406, 'sampling/importance_sampling_ratio/min': 0.0006267410353757441, 'sampling/importance_sampling_ratio/mean': 1.0000576972961426, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9274095296859741, 'clip_ratio/low_mean': 3.329443018174061e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.504626536392607e-06, 'clip_ratio/high_max': 1.0018506145570427e-05, 'clip_ratio/region_mean': 3.57990563770727e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 539/1024 [24:37:03<20:10:27, 149.75s/it][AINFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 540/1024 [24:39:40<20:27:01, 152.11s/it][A
+                                                         [A{'loss': 0.0938, 'grad_norm': 0.0039032045751810074, 'learning_rate': 1e-05, 'num_tokens': 475355186.0, 'completions/mean_length': 6400.75, 'completions/min_length': 1015.0, 'completions/max_length': 16146.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6400.75, 'completions/min_terminated_length': 1015.0, 'completions/max_terminated_length': 16146.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3135277032852173, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019878748804330826, 'sampling/sampling_logp_difference/max': 12.3806791305542, 'sampling/importance_sampling_ratio/min': 4.19893694925122e-06, 'sampling/importance_sampling_ratio/mean': 0.9999880194664001, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8927748426795006, 'clip_ratio/low_mean': 4.140612338687788e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.23904565297562e-06, 'clip_ratio/high_max': 3.1761268928676145e-05, 'clip_ratio/region_mean': 5.064516949460085e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 540/1024 [24:39:40<20:27:01, 152.11s/it][AINFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 541/1024 [24:42:30<21:08:14, 157.55s/it][A
+                                                         [A{'loss': 0.0642, 'grad_norm': 0.004979084711521864, 'learning_rate': 1e-05, 'num_tokens': 476289752.0, 'completions/mean_length': 7150.234375, 'completions/min_length': 1548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6928.62451171875, 'completions/min_terminated_length': 1548.0, 'completions/max_terminated_length': 14347.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3369181156158447, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019658904522657394, 'sampling/sampling_logp_difference/max': 7.75062894821167, 'sampling/importance_sampling_ratio/min': 0.0004304716712795198, 'sampling/importance_sampling_ratio/mean': 0.9999991059303284, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8632503524422646, 'clip_ratio/low_mean': 5.609390495919797e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.227385253827379e-06, 'clip_ratio/high_max': 2.524126966818585e-05, 'clip_ratio/region_mean': 6.332129100883321e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 541/1024 [24:42:30<21:08:14, 157.55s/it][AINFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 542/1024 [24:45:29<21:56:38, 163.90s/it][A
+                                                         [A{'loss': 0.0332, 'grad_norm': 0.003560611279681325, 'learning_rate': 1e-05, 'num_tokens': 477186885.0, 'completions/mean_length': 6855.6640625, 'completions/min_length': 771.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6704.4208984375, 'completions/min_terminated_length': 771.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2743411958217621, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01880962960422039, 'sampling/sampling_logp_difference/max': 8.466726303100586, 'sampling/importance_sampling_ratio/min': 0.00021035241661593318, 'sampling/importance_sampling_ratio/mean': 0.9998643398284912, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8328540697693825, 'clip_ratio/low_mean': 3.922748987861269e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.324626497189456e-06, 'clip_ratio/high_max': 2.5298505988757825e-05, 'clip_ratio/region_mean': 4.555211648948898e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 542/1024 [24:45:29<21:56:38, 163.90s/it][AINFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 543/1024 [24:48:32<22:38:57, 169.52s/it][A
+                                                         [A{'loss': 0.0773, 'grad_norm': 0.0037869063671678305, 'learning_rate': 1e-05, 'num_tokens': 478121506.0, 'completions/mean_length': 7117.1015625, 'completions/min_length': 1067.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6818.1689453125, 'completions/min_terminated_length': 1067.0, 'completions/max_terminated_length': 15880.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2919674217700958, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0203043594956398, 'sampling/sampling_logp_difference/max': 14.937435150146484, 'sampling/importance_sampling_ratio/min': 3.256524507833092e-07, 'sampling/importance_sampling_ratio/mean': 0.9999738931655884, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9280833601951599, 'clip_ratio/low_mean': 5.487640487444878e-05, 'clip_ratio/low_min': 6.345177553157555e-06, 'clip_ratio/high_mean': 2.226903745849995e-06, 'clip_ratio/high_max': 8.90761498339998e-06, 'clip_ratio/region_mean': 5.7103308108708006e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 543/1024 [24:48:32<22:38:57, 169.52s/it][AINFO 12-02 14:13:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:13:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:13:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:13:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 544/1024 [24:51:03<21:50:55, 163.87s/it][A
+                                                         [A{'loss': 0.0847, 'grad_norm': 0.002787451259791851, 'learning_rate': 1e-05, 'num_tokens': 479021365.0, 'completions/mean_length': 6885.7109375, 'completions/min_length': 1184.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6734.94482421875, 'completions/min_terminated_length': 1184.0, 'completions/max_terminated_length': 16046.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02060278132557869, 'sampling/sampling_logp_difference/max': 6.589450836181641, 'sampling/importance_sampling_ratio/min': 0.0013747947523370385, 'sampling/importance_sampling_ratio/mean': 1.0000042915344238, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9137701392173767, 'clip_ratio/low_mean': 3.976425330165512e-05, 'clip_ratio/low_min': 4.979286131856497e-06, 'clip_ratio/high_mean': 3.370686670223222e-06, 'clip_ratio/high_max': 1.3482746680892888e-05, 'clip_ratio/region_mean': 4.313493991503492e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 544/1024 [24:51:03<21:50:55, 163.87s/it][AINFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 545/1024 [24:53:55<22:07:57, 166.34s/it][A
+                                                         [A{'loss': 0.0225, 'grad_norm': 0.005555091425776482, 'learning_rate': 1e-05, 'num_tokens': 479951778.0, 'completions/mean_length': 7055.7265625, 'completions/min_length': 601.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6982.275390625, 'completions/min_terminated_length': 601.0, 'completions/max_terminated_length': 15047.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02176634594798088, 'sampling/sampling_logp_difference/max': 15.100777626037598, 'sampling/importance_sampling_ratio/min': 2.7657671353154e-07, 'sampling/importance_sampling_ratio/mean': 0.9999507665634155, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1009352952241898, 'clip_ratio/low_mean': 4.93504342102824e-05, 'clip_ratio/low_min': 5.1258921303087845e-06, 'clip_ratio/high_mean': 8.077826691987866e-06, 'clip_ratio/high_max': 2.918380459959735e-05, 'clip_ratio/region_mean': 5.742826124333078e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 545/1024 [24:53:55<22:07:57, 166.34s/it][AINFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 546/1024 [24:56:21<21:17:28, 160.35s/it][A
+                                                         [A{'loss': 0.1423, 'grad_norm': 0.00568060576915741, 'learning_rate': 1e-05, 'num_tokens': 480749677.0, 'completions/mean_length': 6088.2109375, 'completions/min_length': 528.0, 'completions/max_length': 16100.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6088.2109375, 'completions/min_terminated_length': 528.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.6484375, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.6484375, 'reward_std': 0.3729842007160187, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017093103379011154, 'sampling/sampling_logp_difference/max': 8.437424659729004, 'sampling/importance_sampling_ratio/min': 0.0002166072663385421, 'sampling/importance_sampling_ratio/mean': 0.9999527931213379, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7534168809652328, 'clip_ratio/low_mean': 3.58120408918694e-05, 'clip_ratio/low_min': 5.571651399804978e-06, 'clip_ratio/high_mean': 2.43807289734832e-06, 'clip_ratio/high_max': 9.75229158939328e-06, 'clip_ratio/region_mean': 3.825011424396507e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 546/1024 [24:56:21<21:17:28, 160.35s/it][AINFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 547/1024 [24:58:50<20:47:46, 156.95s/it][A
+                                                         [A{'loss': 0.1025, 'grad_norm': 0.0019015485886484385, 'learning_rate': 1e-05, 'num_tokens': 481489954.0, 'completions/mean_length': 5638.8515625, 'completions/min_length': 1352.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5380.96826171875, 'completions/min_terminated_length': 1352.0, 'completions/max_terminated_length': 16029.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019102448597550392, 'sampling/sampling_logp_difference/max': 8.62470817565918, 'sampling/importance_sampling_ratio/min': 0.0001796126161934808, 'sampling/importance_sampling_ratio/mean': 0.999911904335022, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8868100792169571, 'clip_ratio/low_mean': 2.870424191314669e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.5532760850619525e-06, 'clip_ratio/high_max': 1.821310434024781e-05, 'clip_ratio/region_mean': 3.325751754346129e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 547/1024 [24:58:50<20:47:46, 156.95s/it][AINFO 12-02 14:23:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▎    | 548/1024 [25:01:28<20:47:06, 157.20s/it][A
+                                                         [A{'loss': 0.0642, 'grad_norm': 0.004203350283205509, 'learning_rate': 1e-05, 'num_tokens': 482375358.0, 'completions/mean_length': 6776.59375, 'completions/min_length': 588.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6624.095703125, 'completions/min_terminated_length': 588.0, 'completions/max_terminated_length': 15258.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019327163696289062, 'sampling/sampling_logp_difference/max': 5.6320695877075195, 'sampling/importance_sampling_ratio/min': 0.0036098493728786707, 'sampling/importance_sampling_ratio/mean': 0.9999104738235474, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9075161814689636, 'clip_ratio/low_mean': 3.169551814607985e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.0229532411758555e-06, 'clip_ratio/high_max': 2.3414544557454064e-05, 'clip_ratio/region_mean': 3.8718471842003055e-05, 'epoch': 0.5}
+
+ 54%|█████▎    | 548/1024 [25:01:28<20:47:06, 157.20s/it][AINFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▎    | 549/1024 [25:04:22<21:25:56, 162.44s/it][A
+                                                         [A{'loss': 0.0499, 'grad_norm': 0.004891456104815006, 'learning_rate': 1e-05, 'num_tokens': 483357450.0, 'completions/mean_length': 7507.59375, 'completions/min_length': 774.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7071.048828125, 'completions/min_terminated_length': 774.0, 'completions/max_terminated_length': 15684.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2772369980812073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019086822867393494, 'sampling/sampling_logp_difference/max': 5.721317291259766, 'sampling/importance_sampling_ratio/min': 0.0032753932755440474, 'sampling/importance_sampling_ratio/mean': 0.9999200701713562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8015655726194382, 'clip_ratio/low_mean': 3.6077018648938974e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.84939061809564e-06, 'clip_ratio/high_max': 1.8746226487564854e-05, 'clip_ratio/region_mean': 4.192640903966094e-05, 'epoch': 0.51}
+
+ 54%|█████▎    | 549/1024 [25:04:22<21:25:56, 162.44s/it][AINFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▎    | 550/1024 [25:07:10<21:34:52, 163.91s/it][A
+                                                         [A{'loss': 0.028, 'grad_norm': 0.003564947983250022, 'learning_rate': 1e-05, 'num_tokens': 484153554.0, 'completions/mean_length': 6061.3125, 'completions/min_length': 627.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5813.568359375, 'completions/min_terminated_length': 627.0, 'completions/max_terminated_length': 16107.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.26143792271614075, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018360145390033722, 'sampling/sampling_logp_difference/max': 3.908921003341675, 'sampling/importance_sampling_ratio/min': 0.02006213553249836, 'sampling/importance_sampling_ratio/mean': 0.9999876022338867, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8335569724440575, 'clip_ratio/low_mean': 3.096040018135682e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.011492757806991e-06, 'clip_ratio/high_max': 2.4045971031227964e-05, 'clip_ratio/region_mean': 3.697189299600723e-05, 'epoch': 0.51}
+
+ 54%|█████▎    | 550/1024 [25:07:10<21:34:52, 163.91s/it][AINFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 551/1024 [25:10:11<22:11:51, 168.95s/it][A
+                                                         [A{'loss': 0.0976, 'grad_norm': 0.0032013265881687403, 'learning_rate': 1e-05, 'num_tokens': 485111601.0, 'completions/mean_length': 7312.4921875, 'completions/min_length': 588.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7241.06298828125, 'completions/min_terminated_length': 588.0, 'completions/max_terminated_length': 15957.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.21040895581245422, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020712960511446, 'sampling/sampling_logp_difference/max': 5.0278730392456055, 'sampling/importance_sampling_ratio/min': 0.006552733480930328, 'sampling/importance_sampling_ratio/mean': 0.9999306201934814, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9900097697973251, 'clip_ratio/low_mean': 4.612986276697484e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2738347524864366e-06, 'clip_ratio/high_max': 9.095339009945747e-06, 'clip_ratio/region_mean': 4.840369865632965e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 551/1024 [25:10:11<22:11:51, 168.95s/it][AINFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 552/1024 [25:12:33<21:06:55, 161.05s/it][A
+                                                         [A{'loss': 0.0888, 'grad_norm': 0.002972986316308379, 'learning_rate': 1e-05, 'num_tokens': 485971554.0, 'completions/mean_length': 6571.4453125, 'completions/min_length': 951.0, 'completions/max_length': 14797.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6571.4453125, 'completions/min_terminated_length': 951.0, 'completions/max_terminated_length': 14797.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020055105909705162, 'sampling/sampling_logp_difference/max': 10.613155364990234, 'sampling/importance_sampling_ratio/min': 2.4590379325672984e-05, 'sampling/importance_sampling_ratio/mean': 0.9998995065689087, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8801060244441032, 'clip_ratio/low_mean': 4.3424448904261226e-05, 'clip_ratio/low_min': 4.718405762105249e-06, 'clip_ratio/high_mean': 4.2937051603075815e-06, 'clip_ratio/high_max': 1.360053283860907e-05, 'clip_ratio/region_mean': 4.771815429194248e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 552/1024 [25:12:33<21:06:55, 161.05s/it][AINFO 12-02 14:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:37:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 553/1024 [25:15:12<20:59:42, 160.47s/it][A
+                                                         [A{'loss': 0.0278, 'grad_norm': 0.00798189826309681, 'learning_rate': 1e-05, 'num_tokens': 486873791.0, 'completions/mean_length': 6879.2890625, 'completions/min_length': 430.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6728.4208984375, 'completions/min_terminated_length': 430.0, 'completions/max_terminated_length': 16243.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.22673210501670837, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02010834403336048, 'sampling/sampling_logp_difference/max': 5.25710916519165, 'sampling/importance_sampling_ratio/min': 0.005210345610976219, 'sampling/importance_sampling_ratio/mean': 0.9999493956565857, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8452998399734497, 'clip_ratio/low_mean': 3.511405452627514e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.057813901501504e-06, 'clip_ratio/high_max': 8.231255606006016e-06, 'clip_ratio/region_mean': 3.71718685983069e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 553/1024 [25:15:12<20:59:42, 160.47s/it][AINFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 554/1024 [25:17:52<20:54:13, 160.11s/it][A
+                                                         [A{'loss': 0.0389, 'grad_norm': 0.0038661460857838392, 'learning_rate': 1e-05, 'num_tokens': 487814936.0, 'completions/mean_length': 7169.8828125, 'completions/min_length': 694.0, 'completions/max_length': 16237.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7169.8828125, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 16237.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.23751862347126007, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02097059041261673, 'sampling/sampling_logp_difference/max': 9.96898078918457, 'sampling/importance_sampling_ratio/min': 4.6830271458020434e-05, 'sampling/importance_sampling_ratio/mean': 0.9999849796295166, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9671438857913017, 'clip_ratio/low_mean': 6.0756912262149854e-05, 'clip_ratio/low_min': 1.0878021839744179e-05, 'clip_ratio/high_mean': 4.394269467411505e-06, 'clip_ratio/high_max': 1.757707786964602e-05, 'clip_ratio/region_mean': 6.51511809337535e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 554/1024 [25:17:52<20:54:13, 160.11s/it][AINFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 555/1024 [25:20:26<20:38:25, 158.43s/it][A
+                                                         [A{'loss': 0.0252, 'grad_norm': 0.002214127918705344, 'learning_rate': 1e-05, 'num_tokens': 488720293.0, 'completions/mean_length': 6945.0390625, 'completions/min_length': 940.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6870.71630859375, 'completions/min_terminated_length': 940.0, 'completions/max_terminated_length': 15458.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01968962326645851, 'sampling/sampling_logp_difference/max': 8.04468059539795, 'sampling/importance_sampling_ratio/min': 0.00032080389792099595, 'sampling/importance_sampling_ratio/mean': 0.9999914169311523, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9309702143073082, 'clip_ratio/low_mean': 3.180719090778439e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1623150157902273e-06, 'clip_ratio/high_max': 4.649260063160909e-06, 'clip_ratio/region_mean': 3.2969506037261453e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 555/1024 [25:20:26<20:38:25, 158.43s/it][AINFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 556/1024 [25:22:57<20:18:40, 156.24s/it][A
+                                                         [A{'loss': 0.0919, 'grad_norm': 0.0026088031008839607, 'learning_rate': 1e-05, 'num_tokens': 489504626.0, 'completions/mean_length': 5970.1015625, 'completions/min_length': 610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5804.8017578125, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3237725496292114, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018132124096155167, 'sampling/sampling_logp_difference/max': 7.999942779541016, 'sampling/importance_sampling_ratio/min': 0.00033548183273524046, 'sampling/importance_sampling_ratio/mean': 0.9999892711639404, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8274230882525444, 'clip_ratio/low_mean': 5.9988536690980254e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.857000706375402e-06, 'clip_ratio/high_max': 1.5428002825501608e-05, 'clip_ratio/region_mean': 6.384553716998198e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 556/1024 [25:22:57<20:18:40, 156.24s/it][AINFO 12-02 14:47:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:47:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:47:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:47:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 557/1024 [25:25:45<20:44:11, 159.85s/it][A
+                                                         [A{'loss': 0.0021, 'grad_norm': 0.0040014018304646015, 'learning_rate': 1e-05, 'num_tokens': 490431156.0, 'completions/mean_length': 7099.578125, 'completions/min_length': 567.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6952.20654296875, 'completions/min_terminated_length': 567.0, 'completions/max_terminated_length': 15636.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.25460803508758545, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02036934345960617, 'sampling/sampling_logp_difference/max': 7.249959468841553, 'sampling/importance_sampling_ratio/min': 0.0007102031959220767, 'sampling/importance_sampling_ratio/mean': 0.9999368786811829, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8690815567970276, 'clip_ratio/low_mean': 3.257978141846252e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.032566036788921e-06, 'clip_ratio/high_max': 1.628765676287003e-05, 'clip_ratio/region_mean': 3.761234722787776e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 557/1024 [25:25:45<20:44:11, 159.85s/it][AINFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 558/1024 [25:28:42<21:19:19, 164.72s/it][A
+                                                         [A{'loss': 0.0711, 'grad_norm': 0.002252641599625349, 'learning_rate': 1e-05, 'num_tokens': 491378450.0, 'completions/mean_length': 7253.296875, 'completions/min_length': 727.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6725.07421875, 'completions/min_terminated_length': 727.0, 'completions/max_terminated_length': 16301.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01926814392209053, 'sampling/sampling_logp_difference/max': 10.87448501586914, 'sampling/importance_sampling_ratio/min': 1.893525586638134e-05, 'sampling/importance_sampling_ratio/mean': 0.9999855756759644, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8692722395062447, 'clip_ratio/low_mean': 3.747020150512981e-05, 'clip_ratio/low_min': 3.852436293527717e-06, 'clip_ratio/high_mean': 3.3287286100858182e-06, 'clip_ratio/high_max': 1.3314914440343273e-05, 'clip_ratio/region_mean': 4.079892983099853e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 558/1024 [25:28:42<21:19:19, 164.72s/it][AINFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 559/1024 [25:31:40<21:49:03, 168.91s/it][A
+                                                         [A{'loss': 0.0684, 'grad_norm': 0.0023995323572307825, 'learning_rate': 1e-05, 'num_tokens': 492398757.0, 'completions/mean_length': 7827.0234375, 'completions/min_length': 808.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7406.18798828125, 'completions/min_terminated_length': 808.0, 'completions/max_terminated_length': 15865.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.26826781034469604, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020725054666399956, 'sampling/sampling_logp_difference/max': 7.951230525970459, 'sampling/importance_sampling_ratio/min': 0.0003522284678183496, 'sampling/importance_sampling_ratio/mean': 0.9999961256980896, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9718392416834831, 'clip_ratio/low_mean': 3.905345306520758e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0567253070803417e-05, 'clip_ratio/high_max': 3.51339258486405e-05, 'clip_ratio/region_mean': 4.962070602232416e-05, 'epoch': 0.51}
+
+ 55%|█████▍    | 559/1024 [25:31:40<21:49:03, 168.91s/it][AINFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 560/1024 [25:34:12<21:05:35, 163.65s/it][A
+                                                         [A{'loss': 0.0298, 'grad_norm': 0.0053934333845973015, 'learning_rate': 1e-05, 'num_tokens': 493259049.0, 'completions/mean_length': 6578.53125, 'completions/min_length': 80.0, 'completions/max_length': 14833.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6578.53125, 'completions/min_terminated_length': 80.0, 'completions/max_terminated_length': 14833.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019497254863381386, 'sampling/sampling_logp_difference/max': 13.345943450927734, 'sampling/importance_sampling_ratio/min': 1.5993017541404697e-06, 'sampling/importance_sampling_ratio/mean': 0.999976396560669, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9265799149870872, 'clip_ratio/low_mean': 4.477454979223694e-05, 'clip_ratio/low_min': 3.5987793580716243e-06, 'clip_ratio/high_mean': 2.3092504193300556e-06, 'clip_ratio/high_max': 9.237001677320222e-06, 'clip_ratio/region_mean': 4.708380049578409e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 560/1024 [25:34:12<21:05:35, 163.65s/it][AINFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 561/1024 [25:37:03<21:20:12, 165.90s/it][A
+                                                         [A{'loss': 0.061, 'grad_norm': 0.003773769596591592, 'learning_rate': 1e-05, 'num_tokens': 494288028.0, 'completions/mean_length': 7893.7734375, 'completions/min_length': 763.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7826.92138671875, 'completions/min_terminated_length': 763.0, 'completions/max_terminated_length': 15783.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.29272884130477905, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020743828266859055, 'sampling/sampling_logp_difference/max': 9.982173919677734, 'sampling/importance_sampling_ratio/min': 4.6216489863581955e-05, 'sampling/importance_sampling_ratio/mean': 1.0000444650650024, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9697273746132851, 'clip_ratio/low_mean': 4.2538599473118666e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.580789669082151e-06, 'clip_ratio/high_max': 6.991247119003674e-06, 'clip_ratio/region_mean': 4.511938891482714e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 561/1024 [25:37:03<21:20:12, 165.90s/it][AINFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 562/1024 [25:39:55<21:32:50, 167.90s/it][A
+                                                         [A{'loss': 0.0217, 'grad_norm': 0.006334445904940367, 'learning_rate': 1e-05, 'num_tokens': 495135903.0, 'completions/mean_length': 6483.7734375, 'completions/min_length': 1030.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6405.81884765625, 'completions/min_terminated_length': 1030.0, 'completions/max_terminated_length': 15024.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.20251333713531494, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018669776618480682, 'sampling/sampling_logp_difference/max': 8.99797248840332, 'sampling/importance_sampling_ratio/min': 0.0001236602693097666, 'sampling/importance_sampling_ratio/mean': 0.9999064207077026, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8293593674898148, 'clip_ratio/low_mean': 3.2997783137034276e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.29665919909894e-06, 'clip_ratio/high_max': 1.060595786839258e-05, 'clip_ratio/region_mean': 3.729444244982005e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 562/1024 [25:39:55<21:32:50, 167.90s/it][AINFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 563/1024 [25:42:43<21:29:09, 167.79s/it][A
+                                                         [A{'loss': 0.0865, 'grad_norm': 0.003286323742941022, 'learning_rate': 1e-05, 'num_tokens': 495986277.0, 'completions/mean_length': 6484.546875, 'completions/min_length': 630.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6246.96044921875, 'completions/min_terminated_length': 630.0, 'completions/max_terminated_length': 16230.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.3763991594314575, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018656805157661438, 'sampling/sampling_logp_difference/max': 10.809014320373535, 'sampling/importance_sampling_ratio/min': 2.0216441043885425e-05, 'sampling/importance_sampling_ratio/mean': 0.999945342540741, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7686850279569626, 'clip_ratio/low_mean': 4.667806888392079e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3393192236653704e-06, 'clip_ratio/high_max': 9.357276894661481e-06, 'clip_ratio/region_mean': 4.901738748230855e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 563/1024 [25:42:43<21:29:09, 167.79s/it][AINFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 564/1024 [25:45:27<21:18:53, 166.81s/it][A
+                                                         [A{'loss': -0.0049, 'grad_norm': 0.005072349216789007, 'learning_rate': 1e-05, 'num_tokens': 496826094.0, 'completions/mean_length': 6411.3203125, 'completions/min_length': 952.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5746.47509765625, 'completions/min_terminated_length': 952.0, 'completions/max_terminated_length': 15720.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019648944959044456, 'sampling/sampling_logp_difference/max': 5.5721211433410645, 'sampling/importance_sampling_ratio/min': 0.0038024066016077995, 'sampling/importance_sampling_ratio/mean': 0.9999135732650757, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.899998240172863, 'clip_ratio/low_mean': 8.26880966542376e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.76577109668142e-06, 'clip_ratio/high_max': 3.368905208844808e-05, 'clip_ratio/region_mean': 1.8034580989478854e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 564/1024 [25:45:27<21:18:53, 166.81s/it][AINFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 565/1024 [25:48:34<22:02:38, 172.89s/it][A
+                                                         [A{'loss': 0.0871, 'grad_norm': 0.005030680447816849, 'learning_rate': 1e-05, 'num_tokens': 497756469.0, 'completions/mean_length': 7110.0546875, 'completions/min_length': 686.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6810.89501953125, 'completions/min_terminated_length': 686.0, 'completions/max_terminated_length': 16300.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3253750801086426, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02187274768948555, 'sampling/sampling_logp_difference/max': 7.749985218048096, 'sampling/importance_sampling_ratio/min': 0.0004307488852646202, 'sampling/importance_sampling_ratio/mean': 0.999985933303833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0061073675751686, 'clip_ratio/low_mean': 4.834715275592316e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.2551004020861e-06, 'clip_ratio/high_max': 1.726673963275971e-05, 'clip_ratio/region_mean': 5.4602252930635586e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 565/1024 [25:48:34<22:02:38, 172.89s/it][AINFO 12-02 15:13:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:13:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:13:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:13:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 566/1024 [25:51:21<21:44:25, 170.89s/it][A
+                                                         [A{'loss': -0.0016, 'grad_norm': 0.002894402015954256, 'learning_rate': 1e-05, 'num_tokens': 498743411.0, 'completions/mean_length': 7546.484375, 'completions/min_length': 405.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7261.40283203125, 'completions/min_terminated_length': 405.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.2380426526069641, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019597206264734268, 'sampling/sampling_logp_difference/max': 10.306904792785645, 'sampling/importance_sampling_ratio/min': 3.340166585985571e-05, 'sampling/importance_sampling_ratio/mean': 0.9998988509178162, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.898541085422039, 'clip_ratio/low_mean': 2.627351494766117e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.480095630147844e-07, 'clip_ratio/high_max': 3.3920382520591374e-06, 'clip_ratio/region_mean': 2.712152416961544e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 566/1024 [25:51:21<21:44:25, 170.89s/it][AINFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 567/1024 [25:54:09<21:35:22, 170.07s/it][A
+                                                         [A{'loss': 0.0352, 'grad_norm': 0.0033100086729973555, 'learning_rate': 1e-05, 'num_tokens': 499612490.0, 'completions/mean_length': 6637.9296875, 'completions/min_length': 340.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6241.74755859375, 'completions/min_terminated_length': 340.0, 'completions/max_terminated_length': 15426.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2782978415489197, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019627269357442856, 'sampling/sampling_logp_difference/max': 8.448633193969727, 'sampling/importance_sampling_ratio/min': 0.000214192972634919, 'sampling/importance_sampling_ratio/mean': 0.9999792575836182, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9469815120100975, 'clip_ratio/low_mean': 1.9815101950371172e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.518700269632973e-07, 'clip_ratio/high_max': 3.407480107853189e-06, 'clip_ratio/region_mean': 2.066697197733447e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 567/1024 [25:54:09<21:35:22, 170.07s/it][AINFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▌    | 568/1024 [25:56:55<21:23:47, 168.92s/it][A
+                                                         [A{'loss': 0.0543, 'grad_norm': 0.006571728736162186, 'learning_rate': 1e-05, 'num_tokens': 500515117.0, 'completions/mean_length': 6903.0859375, 'completions/min_length': 602.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6752.595703125, 'completions/min_terminated_length': 602.0, 'completions/max_terminated_length': 15136.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020653847604990005, 'sampling/sampling_logp_difference/max': 4.107652187347412, 'sampling/importance_sampling_ratio/min': 0.016446342691779137, 'sampling/importance_sampling_ratio/mean': 0.9999945163726807, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.976447619497776, 'clip_ratio/low_mean': 6.551078422489809e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.2405809419251455e-06, 'clip_ratio/high_max': 2.8962323767700582e-05, 'clip_ratio/region_mean': 7.275136522366665e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 568/1024 [25:56:55<21:23:47, 168.92s/it][AINFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 569/1024 [25:59:41<21:14:02, 168.01s/it][A
+                                                         [A{'loss': 0.0618, 'grad_norm': 0.007468517404049635, 'learning_rate': 1e-05, 'num_tokens': 501427056.0, 'completions/mean_length': 6953.8359375, 'completions/min_length': 88.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6570.49560546875, 'completions/min_terminated_length': 88.0, 'completions/max_terminated_length': 15556.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3571978807449341, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01937997341156006, 'sampling/sampling_logp_difference/max': 8.562470436096191, 'sampling/importance_sampling_ratio/min': 0.0001911464933073148, 'sampling/importance_sampling_ratio/mean': 1.0000053644180298, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8397975340485573, 'clip_ratio/low_mean': 7.513643731726916e-05, 'clip_ratio/low_min': 2.2551557776750997e-05, 'clip_ratio/high_mean': 3.6441037991608027e-06, 'clip_ratio/high_max': 1.4576415196643211e-05, 'clip_ratio/region_mean': 7.878054020693526e-05, 'epoch': 0.52}
+
+ 56%|█████▌    | 569/1024 [25:59:41<21:14:02, 168.01s/it][AINFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 570/1024 [26:02:41<21:37:37, 171.49s/it][A
+                                                         [A{'loss': 0.0431, 'grad_norm': 0.004324767272919416, 'learning_rate': 1e-05, 'num_tokens': 502445156.0, 'completions/mean_length': 7807.09375, 'completions/min_length': 562.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7458.43896484375, 'completions/min_terminated_length': 562.0, 'completions/max_terminated_length': 15961.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.3329663574695587, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018592730164527893, 'sampling/sampling_logp_difference/max': 10.418506622314453, 'sampling/importance_sampling_ratio/min': 2.9874459869461134e-05, 'sampling/importance_sampling_ratio/mean': 0.9999243021011353, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7974586114287376, 'clip_ratio/low_mean': 3.7468206755875144e-05, 'clip_ratio/low_min': 5.264044375508092e-06, 'clip_ratio/high_mean': 7.922306224372733e-06, 'clip_ratio/high_max': 3.168922489749093e-05, 'clip_ratio/region_mean': 4.5390514060272835e-05, 'epoch': 0.52}
+
+ 56%|█████▌    | 570/1024 [26:02:41<21:37:37, 171.49s/it][AINFO 12-02 15:27:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:27:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:27:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:27:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 571/1024 [26:05:30<21:29:19, 170.77s/it][A
+                                                         [A{'loss': 0.0434, 'grad_norm': 0.0044867550022900105, 'learning_rate': 1e-05, 'num_tokens': 503293398.0, 'completions/mean_length': 6467.890625, 'completions/min_length': 874.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6310.4921875, 'completions/min_terminated_length': 874.0, 'completions/max_terminated_length': 16133.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2998581528663635, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019022464752197266, 'sampling/sampling_logp_difference/max': 3.6936450004577637, 'sampling/importance_sampling_ratio/min': 0.024881144985556602, 'sampling/importance_sampling_ratio/mean': 0.999916136264801, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8665193468332291, 'clip_ratio/low_mean': 3.436269958001503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.567038670051261e-06, 'clip_ratio/high_max': 1.8414293663227e-05, 'clip_ratio/region_mean': 3.9929738250066293e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 571/1024 [26:05:30<21:29:19, 170.77s/it][AINFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 572/1024 [26:08:32<21:52:04, 174.17s/it][A
+                                                         [A{'loss': 0.0041, 'grad_norm': 0.0033805551938712597, 'learning_rate': 1e-05, 'num_tokens': 504115692.0, 'completions/mean_length': 6275.796875, 'completions/min_length': 517.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6115.349609375, 'completions/min_terminated_length': 517.0, 'completions/max_terminated_length': 16309.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2569621503353119, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018935590982437134, 'sampling/sampling_logp_difference/max': 3.9959733486175537, 'sampling/importance_sampling_ratio/min': 0.018389537930488586, 'sampling/importance_sampling_ratio/mean': 1.0000152587890625, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8425783589482307, 'clip_ratio/low_mean': 3.597185968828853e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.711462454702996e-06, 'clip_ratio/high_max': 1.4845849818811985e-05, 'clip_ratio/region_mean': 3.968332202930469e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 572/1024 [26:08:32<21:52:04, 174.17s/it][AINFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 573/1024 [26:11:14<21:23:14, 170.72s/it][A
+                                                         [A{'loss': 0.0695, 'grad_norm': 0.00652205478399992, 'learning_rate': 1e-05, 'num_tokens': 504826577.0, 'completions/mean_length': 5396.7890625, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5222.38916015625, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 16116.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018737314268946648, 'sampling/sampling_logp_difference/max': 6.373790740966797, 'sampling/importance_sampling_ratio/min': 0.0017056812066584826, 'sampling/importance_sampling_ratio/mean': 0.9999775886535645, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8558806329965591, 'clip_ratio/low_mean': 1.670091853611666e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3471904480866215e-05, 'clip_ratio/high_max': 4.3129479763592826e-05, 'clip_ratio/region_mean': 3.0172822903296037e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 573/1024 [26:11:14<21:23:14, 170.72s/it][AINFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 574/1024 [26:14:25<22:04:14, 176.57s/it][A
+                                                         [A{'loss': 0.0698, 'grad_norm': 0.0018958896398544312, 'learning_rate': 1e-05, 'num_tokens': 505846438.0, 'completions/mean_length': 7798.9765625, 'completions/min_length': 319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6991.837890625, 'completions/min_terminated_length': 319.0, 'completions/max_terminated_length': 16298.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.21253062784671783, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019361287355422974, 'sampling/sampling_logp_difference/max': 10.623047828674316, 'sampling/importance_sampling_ratio/min': 2.434831731079612e-05, 'sampling/importance_sampling_ratio/mean': 0.9999515414237976, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8846152648329735, 'clip_ratio/low_mean': 2.3435458388121333e-05, 'clip_ratio/low_min': 3.954319709009724e-06, 'clip_ratio/high_mean': 1.728673169054673e-06, 'clip_ratio/high_max': 6.914692676218692e-06, 'clip_ratio/region_mean': 2.5164132239297032e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 574/1024 [26:14:25<22:04:14, 176.57s/it][AINFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 575/1024 [26:17:12<21:41:14, 173.88s/it][A
+                                                         [A{'loss': 0.1295, 'grad_norm': 0.003035407979041338, 'learning_rate': 1e-05, 'num_tokens': 506670477.0, 'completions/mean_length': 6272.5546875, 'completions/min_length': 901.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6029.88037109375, 'completions/min_terminated_length': 901.0, 'completions/max_terminated_length': 16280.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019988738000392914, 'sampling/sampling_logp_difference/max': 6.716870307922363, 'sampling/importance_sampling_ratio/min': 0.0012103202752768993, 'sampling/importance_sampling_ratio/mean': 0.9999212026596069, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9714803844690323, 'clip_ratio/low_mean': 5.590463968019321e-05, 'clip_ratio/low_min': 4.822531082027126e-06, 'clip_ratio/high_mean': 5.064732249593362e-06, 'clip_ratio/high_max': 1.085428675651201e-05, 'clip_ratio/region_mean': 6.096937283928128e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 575/1024 [26:17:12<21:41:14, 173.88s/it][AINFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▋    | 576/1024 [26:19:59<21:22:16, 171.73s/it][A
+                                                         [A{'loss': 0.06, 'grad_norm': 0.005080445669591427, 'learning_rate': 1e-05, 'num_tokens': 507471717.0, 'completions/mean_length': 6060.75, 'completions/min_length': 593.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5896.88916015625, 'completions/min_terminated_length': 593.0, 'completions/max_terminated_length': 16115.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3135228157043457, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019146449863910675, 'sampling/sampling_logp_difference/max': 5.961174488067627, 'sampling/importance_sampling_ratio/min': 0.0025768836494535208, 'sampling/importance_sampling_ratio/mean': 0.9999859929084778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8791732639074326, 'clip_ratio/low_mean': 4.479086726405512e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.294149900691991e-06, 'clip_ratio/high_max': 2.1176599602767965e-05, 'clip_ratio/region_mean': 5.008501784686814e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 576/1024 [26:19:59<21:22:16, 171.73s/it][AINFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▋    | 577/1024 [26:22:47<21:12:08, 170.76s/it][A
+                                                         [A{'loss': 0.0263, 'grad_norm': 0.002491918858140707, 'learning_rate': 1e-05, 'num_tokens': 508420417.0, 'completions/mean_length': 7221.65625, 'completions/min_length': 1071.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7149.51171875, 'completions/min_terminated_length': 1071.0, 'completions/max_terminated_length': 16319.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.22908622026443481, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019857721403241158, 'sampling/sampling_logp_difference/max': 6.906219959259033, 'sampling/importance_sampling_ratio/min': 0.0010015364969149232, 'sampling/importance_sampling_ratio/mean': 0.9999144077301025, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9068904295563698, 'clip_ratio/low_mean': 3.991827338722942e-05, 'clip_ratio/low_min': 4.394445568323135e-06, 'clip_ratio/high_mean': 3.978321103659255e-06, 'clip_ratio/high_max': 1.591328441463702e-05, 'clip_ratio/region_mean': 4.389659511616628e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 577/1024 [26:22:47<21:12:08, 170.76s/it][AINFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▋    | 578/1024 [26:25:40<21:13:21, 171.30s/it][A
+                                                         [A{'loss': 0.1167, 'grad_norm': 0.0038857783656567335, 'learning_rate': 1e-05, 'num_tokens': 509367579.0, 'completions/mean_length': 7279.765625, 'completions/min_length': 754.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6909.67431640625, 'completions/min_terminated_length': 754.0, 'completions/max_terminated_length': 16090.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.3782213628292084, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01783195324242115, 'sampling/sampling_logp_difference/max': 9.374939918518066, 'sampling/importance_sampling_ratio/min': 8.482332486892119e-05, 'sampling/importance_sampling_ratio/mean': 0.9999372959136963, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7393763959407806, 'clip_ratio/low_mean': 4.729307283923845e-05, 'clip_ratio/low_min': 3.3817600524344016e-06, 'clip_ratio/high_mean': 6.80946584452613e-07, 'clip_ratio/high_max': 2.723786337810452e-06, 'clip_ratio/region_mean': 4.7974018798413454e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 578/1024 [26:25:40<21:13:21, 171.30s/it][AINFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 579/1024 [26:27:57<19:53:39, 160.94s/it][A
+                                                         [A{'loss': 0.1534, 'grad_norm': 0.004505726508796215, 'learning_rate': 1e-05, 'num_tokens': 510076403.0, 'completions/mean_length': 5381.4375, 'completions/min_length': 1030.0, 'completions/max_length': 15946.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5381.4375, 'completions/min_terminated_length': 1030.0, 'completions/max_terminated_length': 15946.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3861297369003296, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.019285976886749268, 'sampling/sampling_logp_difference/max': 6.124998569488525, 'sampling/importance_sampling_ratio/min': 0.0021874941885471344, 'sampling/importance_sampling_ratio/mean': 0.9999825358390808, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8337196409702301, 'clip_ratio/low_mean': 5.770765028501046e-05, 'clip_ratio/low_min': 6.032236342434771e-06, 'clip_ratio/high_mean': 6.067322146918741e-06, 'clip_ratio/high_max': 2.4269288587674964e-05, 'clip_ratio/region_mean': 6.377497174980817e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 579/1024 [26:27:57<19:53:39, 160.94s/it][AINFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 580/1024 [26:30:58<20:35:17, 166.93s/it][A
+                                                         [A{'loss': 0.0288, 'grad_norm': 0.0039497604593634605, 'learning_rate': 1e-05, 'num_tokens': 511177974.0, 'completions/mean_length': 8440.7109375, 'completions/min_length': 472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 8250.072265625, 'completions/min_terminated_length': 472.0, 'completions/max_terminated_length': 15789.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.18990950286388397, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020451124757528305, 'sampling/sampling_logp_difference/max': 8.424702644348145, 'sampling/importance_sampling_ratio/min': 0.00021938055579084903, 'sampling/importance_sampling_ratio/mean': 0.999910831451416, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8920768201351166, 'clip_ratio/low_mean': 4.1738339632502175e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.577795834848075e-06, 'clip_ratio/high_max': 1.83111833393923e-05, 'clip_ratio/region_mean': 4.631613546735025e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 580/1024 [26:30:58<20:35:17, 166.93s/it][AINFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 581/1024 [26:33:44<20:31:53, 166.85s/it][A
+                                                         [A{'loss': 0.0866, 'grad_norm': 0.0024386425502598286, 'learning_rate': 1e-05, 'num_tokens': 512054655.0, 'completions/mean_length': 6702.3828125, 'completions/min_length': 1169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6470.0244140625, 'completions/min_terminated_length': 1169.0, 'completions/max_terminated_length': 16077.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.26645052433013916, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018986206501722336, 'sampling/sampling_logp_difference/max': 6.486593246459961, 'sampling/importance_sampling_ratio/min': 0.0015237311599776149, 'sampling/importance_sampling_ratio/mean': 1.0000202655792236, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8600481152534485, 'clip_ratio/low_mean': 4.171912905803765e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.427778835884965e-06, 'clip_ratio/high_max': 1.371111534353986e-05, 'clip_ratio/region_mean': 4.514690772339236e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 581/1024 [26:33:44<20:31:53, 166.85s/it][AINFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 582/1024 [26:36:06<19:33:56, 159.36s/it][A
+                                                         [A{'loss': 0.0617, 'grad_norm': 0.0072782449424266815, 'learning_rate': 1e-05, 'num_tokens': 512696537.0, 'completions/mean_length': 4845.953125, 'completions/min_length': 160.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4755.1025390625, 'completions/min_terminated_length': 160.0, 'completions/max_terminated_length': 13410.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01862735114991665, 'sampling/sampling_logp_difference/max': 4.027317047119141, 'sampling/importance_sampling_ratio/min': 0.017822081223130226, 'sampling/importance_sampling_ratio/mean': 0.9999409317970276, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9067303538322449, 'clip_ratio/low_mean': 2.6773893978315755e-05, 'clip_ratio/low_min': 4.736104074254399e-06, 'clip_ratio/high_mean': 4.2680171645770315e-06, 'clip_ratio/high_max': 9.279537152906414e-06, 'clip_ratio/region_mean': 3.1041911142892786e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 582/1024 [26:36:06<19:33:56, 159.36s/it][AINFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 583/1024 [26:38:33<19:03:22, 155.56s/it][A
+                                                         [A{'loss': 0.0799, 'grad_norm': 0.005057654343545437, 'learning_rate': 1e-05, 'num_tokens': 513505135.0, 'completions/mean_length': 6173.171875, 'completions/min_length': 756.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6011.095703125, 'completions/min_terminated_length': 756.0, 'completions/max_terminated_length': 16282.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2767051160335541, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020879898220300674, 'sampling/sampling_logp_difference/max': 8.342979431152344, 'sampling/importance_sampling_ratio/min': 0.0002380619989708066, 'sampling/importance_sampling_ratio/mean': 0.9999635219573975, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9604142308235168, 'clip_ratio/low_mean': 4.360654588708712e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.104518898704555e-06, 'clip_ratio/high_max': 8.41807559481822e-06, 'clip_ratio/region_mean': 4.5711064331044327e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 583/1024 [26:38:33<19:03:22, 155.56s/it][AINFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 584/1024 [26:40:59<18:39:54, 152.71s/it][A
+                                                         [A{'loss': 0.0991, 'grad_norm': 0.0047672707587480545, 'learning_rate': 1e-05, 'num_tokens': 514232058.0, 'completions/mean_length': 5546.5234375, 'completions/min_length': 1113.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5374.50048828125, 'completions/min_terminated_length': 1113.0, 'completions/max_terminated_length': 15173.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.27038949728012085, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018185433000326157, 'sampling/sampling_logp_difference/max': 9.74951171875, 'sampling/importance_sampling_ratio/min': 5.8323133998783305e-05, 'sampling/importance_sampling_ratio/mean': 0.9999624490737915, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8015405982732773, 'clip_ratio/low_mean': 4.2579683963595016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.227510205761064e-06, 'clip_ratio/high_max': 7.327939783863258e-06, 'clip_ratio/region_mean': 4.580719428304292e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 584/1024 [26:40:59<18:39:54, 152.71s/it][AINFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 585/1024 [26:43:37<18:49:10, 154.33s/it][A
+                                                         [A{'loss': 0.0453, 'grad_norm': 0.005850035231560469, 'learning_rate': 1e-05, 'num_tokens': 515103184.0, 'completions/mean_length': 6637.359375, 'completions/min_length': 1144.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6482.6513671875, 'completions/min_terminated_length': 1144.0, 'completions/max_terminated_length': 15778.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.24988999962806702, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020641878247261047, 'sampling/sampling_logp_difference/max': 15.747965812683105, 'sampling/importance_sampling_ratio/min': 1.4479226706498594e-07, 'sampling/importance_sampling_ratio/mean': 0.999963104724884, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0173144191503525, 'clip_ratio/low_mean': 5.04182496570138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.388961428958282e-06, 'clip_ratio/high_max': 1.3804907666781219e-05, 'clip_ratio/region_mean': 5.480721097228525e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 585/1024 [26:43:37<18:49:10, 154.33s/it][AINFO 12-02 16:08:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:08:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:08:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:08:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 586/1024 [26:46:16<18:55:55, 155.61s/it][A
+                                                         [A{'loss': 0.0831, 'grad_norm': 0.0037875184789299965, 'learning_rate': 1e-05, 'num_tokens': 516009791.0, 'completions/mean_length': 6940.6171875, 'completions/min_length': 1273.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6866.259765625, 'completions/min_terminated_length': 1273.0, 'completions/max_terminated_length': 15716.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.27222442626953125, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018527517095208168, 'sampling/sampling_logp_difference/max': 12.062490463256836, 'sampling/importance_sampling_ratio/min': 5.772008080384694e-06, 'sampling/importance_sampling_ratio/mean': 0.9999997615814209, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8547529205679893, 'clip_ratio/low_mean': 5.566071547491447e-05, 'clip_ratio/low_min': 8.978264304460026e-06, 'clip_ratio/high_mean': 3.986071760664345e-06, 'clip_ratio/high_max': 1.594428704265738e-05, 'clip_ratio/region_mean': 5.964678746295249e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 586/1024 [26:46:16<18:55:55, 155.61s/it][AINFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 587/1024 [26:48:54<18:59:05, 156.40s/it][A
+                                                         [A{'loss': 0.0502, 'grad_norm': 0.0015506440540775657, 'learning_rate': 1e-05, 'num_tokens': 516903335.0, 'completions/mean_length': 6837.125, 'completions/min_length': 1319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6761.95263671875, 'completions/min_terminated_length': 1319.0, 'completions/max_terminated_length': 15387.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.20593318343162537, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020130250602960587, 'sampling/sampling_logp_difference/max': 10.0628080368042, 'sampling/importance_sampling_ratio/min': 4.2636147554730996e-05, 'sampling/importance_sampling_ratio/mean': 0.9999232292175293, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9027494043111801, 'clip_ratio/low_mean': 3.340147941344185e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.731095721879683e-06, 'clip_ratio/high_max': 6.924382887518732e-06, 'clip_ratio/region_mean': 3.5132575476382044e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 587/1024 [26:48:54<18:59:05, 156.40s/it][AINFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 588/1024 [26:51:53<19:44:51, 163.05s/it][A
+                                                         [A{'loss': 0.1165, 'grad_norm': 0.003520917845889926, 'learning_rate': 1e-05, 'num_tokens': 517929081.0, 'completions/mean_length': 7866.703125, 'completions/min_length': 49.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7222.5380859375, 'completions/min_terminated_length': 49.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3316730856895447, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01890747994184494, 'sampling/sampling_logp_difference/max': 9.684585571289062, 'sampling/importance_sampling_ratio/min': 6.223546370165423e-05, 'sampling/importance_sampling_ratio/mean': 0.9999421834945679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8133657574653625, 'clip_ratio/low_mean': 3.885528553837503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1935539368532773e-06, 'clip_ratio/high_max': 1.2774215747413109e-05, 'clip_ratio/region_mean': 4.204883930469805e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 588/1024 [26:51:53<19:44:51, 163.05s/it][AINFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 589/1024 [26:54:28<19:26:20, 160.87s/it][A
+                                                         [A{'loss': 0.0447, 'grad_norm': 0.0029796145390719175, 'learning_rate': 1e-05, 'num_tokens': 518810247.0, 'completions/mean_length': 6701.296875, 'completions/min_length': 24.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6547.603515625, 'completions/min_terminated_length': 24.0, 'completions/max_terminated_length': 15944.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2869499921798706, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01977725327014923, 'sampling/sampling_logp_difference/max': 22.101436614990234, 'sampling/importance_sampling_ratio/min': 2.520391673144218e-10, 'sampling/importance_sampling_ratio/mean': 0.9999505877494812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9360691756010056, 'clip_ratio/low_mean': 3.457626269209868e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7355519048578572e-06, 'clip_ratio/high_max': 6.942207619431429e-06, 'clip_ratio/region_mean': 3.631181459695654e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 589/1024 [26:54:28<19:26:20, 160.87s/it][AINFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 590/1024 [26:57:27<20:02:48, 166.29s/it][A
+                                                         [A{'loss': 0.0477, 'grad_norm': 0.0024249793495982885, 'learning_rate': 1e-05, 'num_tokens': 519730577.0, 'completions/mean_length': 7029.453125, 'completions/min_length': 1180.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6804.9443359375, 'completions/min_terminated_length': 1180.0, 'completions/max_terminated_length': 15971.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.22803518176078796, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01923082396388054, 'sampling/sampling_logp_difference/max': 15.630853652954102, 'sampling/importance_sampling_ratio/min': 1.6278204384434503e-07, 'sampling/importance_sampling_ratio/mean': 0.9999786615371704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9168537557125092, 'clip_ratio/low_mean': 3.738725240509666e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.476589184487239e-07, 'clip_ratio/high_max': 3.7906356737948954e-06, 'clip_ratio/region_mean': 3.8334911323545384e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 590/1024 [26:57:27<20:02:48, 166.29s/it][AINFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 591/1024 [27:00:27<20:29:00, 170.30s/it][A
+                                                         [A{'loss': 0.0821, 'grad_norm': 0.003160425927489996, 'learning_rate': 1e-05, 'num_tokens': 520680707.0, 'completions/mean_length': 7255.453125, 'completions/min_length': 832.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6646.8837890625, 'completions/min_terminated_length': 832.0, 'completions/max_terminated_length': 15600.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2461756467819214, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019255205988883972, 'sampling/sampling_logp_difference/max': 6.968714237213135, 'sampling/importance_sampling_ratio/min': 0.0009408618789166212, 'sampling/importance_sampling_ratio/mean': 1.0000334978103638, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8241118341684341, 'clip_ratio/low_mean': 3.2254738812298456e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.1899421552880085e-06, 'clip_ratio/high_max': 2.4759768621152034e-05, 'clip_ratio/region_mean': 3.8444680967586464e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 591/1024 [27:00:27<20:29:00, 170.30s/it][AINFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 592/1024 [27:03:11<20:11:47, 168.30s/it][A
+                                                         [A{'loss': 0.0267, 'grad_norm': 0.00411194609478116, 'learning_rate': 1e-05, 'num_tokens': 521703303.0, 'completions/mean_length': 7819.96875, 'completions/min_length': 512.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7752.53564453125, 'completions/min_terminated_length': 512.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022727783769369125, 'sampling/sampling_logp_difference/max': 7.937360763549805, 'sampling/importance_sampling_ratio/min': 0.0003571478300727904, 'sampling/importance_sampling_ratio/mean': 0.9999041557312012, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1218742430210114, 'clip_ratio/low_mean': 3.9836502310208743e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.864952139385423e-06, 'clip_ratio/high_max': 7.459808557541692e-06, 'clip_ratio/region_mean': 4.170145416537707e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 592/1024 [27:03:11<20:11:47, 168.30s/it][AINFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 593/1024 [27:05:48<19:44:37, 164.91s/it][A
+                                                         [A{'loss': 0.0339, 'grad_norm': 0.0022753921803086996, 'learning_rate': 1e-05, 'num_tokens': 522531422.0, 'completions/mean_length': 6322.8671875, 'completions/min_length': 637.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6163.1669921875, 'completions/min_terminated_length': 637.0, 'completions/max_terminated_length': 16117.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.20753081142902374, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01893780007958412, 'sampling/sampling_logp_difference/max': 12.124995231628418, 'sampling/importance_sampling_ratio/min': 5.422274170996388e-06, 'sampling/importance_sampling_ratio/mean': 0.9998952150344849, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8323960080742836, 'clip_ratio/low_mean': 3.738353416338214e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.714662395599589e-06, 'clip_ratio/high_max': 1.8858649582398357e-05, 'clip_ratio/region_mean': 4.2098196558981726e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 593/1024 [27:05:48<19:44:37, 164.91s/it][AINFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 594/1024 [27:08:45<20:09:31, 168.77s/it][A
+                                                         [A{'loss': -0.0134, 'grad_norm': 0.004338000901043415, 'learning_rate': 1e-05, 'num_tokens': 523453262.0, 'completions/mean_length': 7054.0625, 'completions/min_length': 101.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6905.96875, 'completions/min_terminated_length': 101.0, 'completions/max_terminated_length': 16055.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.13204573094844818, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.01982954889535904, 'sampling/sampling_logp_difference/max': 9.437154769897461, 'sampling/importance_sampling_ratio/min': 7.97068714746274e-05, 'sampling/importance_sampling_ratio/mean': 0.9998721480369568, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.866028867661953, 'clip_ratio/low_mean': 1.1187657776190463e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.943995564754005e-07, 'clip_ratio/high_max': 3.977598225901602e-06, 'clip_ratio/region_mean': 1.2182057332665863e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 594/1024 [27:08:45<20:09:31, 168.77s/it][AINFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 595/1024 [27:11:38<20:15:32, 170.00s/it][A
+                                                         [A{'loss': 0.0648, 'grad_norm': 0.003401415189728141, 'learning_rate': 1e-05, 'num_tokens': 524436831.0, 'completions/mean_length': 7539.0703125, 'completions/min_length': 446.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7027.3798828125, 'completions/min_terminated_length': 446.0, 'completions/max_terminated_length': 16361.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2511882185935974, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019884679466485977, 'sampling/sampling_logp_difference/max': 10.775017738342285, 'sampling/importance_sampling_ratio/min': 2.0915547793265432e-05, 'sampling/importance_sampling_ratio/mean': 0.999969482421875, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8601142391562462, 'clip_ratio/low_mean': 3.533169467573316e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7596285551444453e-06, 'clip_ratio/high_max': 1.5038514220577781e-05, 'clip_ratio/region_mean': 3.9091323742468376e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 595/1024 [27:11:38<20:15:32, 170.00s/it][AINFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 596/1024 [27:14:31<20:19:20, 170.93s/it][A
+                                                         [A{'loss': 0.0549, 'grad_norm': 0.002879115054383874, 'learning_rate': 1e-05, 'num_tokens': 525368091.0, 'completions/mean_length': 7137.96875, 'completions/min_length': 606.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6762.11376953125, 'completions/min_terminated_length': 606.0, 'completions/max_terminated_length': 16343.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.27062684297561646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01847894862294197, 'sampling/sampling_logp_difference/max': 7.680283546447754, 'sampling/importance_sampling_ratio/min': 0.0004618439415935427, 'sampling/importance_sampling_ratio/mean': 1.0000025033950806, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7909424379467964, 'clip_ratio/low_mean': 5.44505830930575e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.187421713046206e-06, 'clip_ratio/high_max': 2.9679867111553904e-05, 'clip_ratio/region_mean': 6.263800514716422e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 596/1024 [27:14:31<20:19:20, 170.93s/it][AINFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 597/1024 [27:17:02<19:32:19, 164.73s/it][A
+                                                         [A{'loss': 0.0549, 'grad_norm': 0.004545152187347412, 'learning_rate': 1e-05, 'num_tokens': 526095378.0, 'completions/mean_length': 5486.3671875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5224.82421875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16208.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.33508801460266113, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02017204463481903, 'sampling/sampling_logp_difference/max': 9.675474166870117, 'sampling/importance_sampling_ratio/min': 6.280510569922626e-05, 'sampling/importance_sampling_ratio/mean': 0.9998891353607178, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9588652476668358, 'clip_ratio/low_mean': 3.1269102407804894e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4413541293833987e-06, 'clip_ratio/high_max': 5.765416517533595e-06, 'clip_ratio/region_mean': 3.2710456423501455e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 597/1024 [27:17:02<19:32:19, 164.73s/it][AINFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 598/1024 [27:19:36<19:06:47, 161.52s/it][A
+                                                         [A{'loss': 0.0477, 'grad_norm': 0.004040954168885946, 'learning_rate': 1e-05, 'num_tokens': 526969459.0, 'completions/mean_length': 6636.0078125, 'completions/min_length': 685.0, 'completions/max_length': 16169.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6636.0078125, 'completions/min_terminated_length': 685.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02086419239640236, 'sampling/sampling_logp_difference/max': 17.61687469482422, 'sampling/importance_sampling_ratio/min': 2.2340275407373156e-08, 'sampling/importance_sampling_ratio/mean': 0.9999474287033081, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9497648254036903, 'clip_ratio/low_mean': 4.477498589494644e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.732241109195456e-06, 'clip_ratio/high_max': 1.519483475931338e-05, 'clip_ratio/region_mean': 4.950722734520241e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 598/1024 [27:19:36<19:06:47, 161.52s/it][AINFO 12-02 16:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:44:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 599/1024 [27:22:12<18:53:23, 160.01s/it][A
+                                                         [A{'loss': 0.1854, 'grad_norm': 0.004678349941968918, 'learning_rate': 1e-05, 'num_tokens': 527822197.0, 'completions/mean_length': 6462.953125, 'completions/min_length': 824.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6142.9189453125, 'completions/min_terminated_length': 824.0, 'completions/max_terminated_length': 15820.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3345640003681183, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019832316786050797, 'sampling/sampling_logp_difference/max': 10.463495254516602, 'sampling/importance_sampling_ratio/min': 2.8560234568431042e-05, 'sampling/importance_sampling_ratio/mean': 0.9997877478599548, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9401230812072754, 'clip_ratio/low_mean': 4.7215530003086315e-05, 'clip_ratio/low_min': 5.274039267533226e-06, 'clip_ratio/high_mean': 3.946291258216661e-06, 'clip_ratio/high_max': 1.5785165032866644e-05, 'clip_ratio/region_mean': 5.116182205711084e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 599/1024 [27:22:12<18:53:23, 160.01s/it][AINFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▊    | 600/1024 [27:24:53<18:52:54, 160.32s/it][A
+                                                         [A{'loss': 0.0668, 'grad_norm': 0.0014094997895881534, 'learning_rate': 1e-05, 'num_tokens': 528759458.0, 'completions/mean_length': 7172.1015625, 'completions/min_length': 1079.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6951.01611328125, 'completions/min_terminated_length': 1079.0, 'completions/max_terminated_length': 15170.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.16834919154644012, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018519852310419083, 'sampling/sampling_logp_difference/max': 6.621304035186768, 'sampling/importance_sampling_ratio/min': 0.001331693259999156, 'sampling/importance_sampling_ratio/mean': 0.9999281167984009, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7962061613798141, 'clip_ratio/low_mean': 4.795687004843785e-05, 'clip_ratio/low_min': 7.76807610236574e-06, 'clip_ratio/high_mean': 1.0353853667766089e-06, 'clip_ratio/high_max': 4.1415414671064354e-06, 'clip_ratio/region_mean': 4.899225518784078e-05, 'epoch': 0.55}
+
+ 59%|█████▊    | 600/1024 [27:24:53<18:52:54, 160.32s/it][AINFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▊    | 601/1024 [27:27:37<18:57:19, 161.32s/it][A
+                                                         [A{'loss': 0.0561, 'grad_norm': 0.0038943374529480934, 'learning_rate': 1e-05, 'num_tokens': 529626893.0, 'completions/mean_length': 6612.6484375, 'completions/min_length': 480.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6378.13623046875, 'completions/min_terminated_length': 480.0, 'completions/max_terminated_length': 16195.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018441151827573776, 'sampling/sampling_logp_difference/max': 6.01370906829834, 'sampling/importance_sampling_ratio/min': 0.0024450027849525213, 'sampling/importance_sampling_ratio/mean': 0.9999620914459229, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8218385726213455, 'clip_ratio/low_mean': 5.2064756346226204e-05, 'clip_ratio/low_min': 5.341652013157727e-06, 'clip_ratio/high_mean': 3.018199095095042e-06, 'clip_ratio/high_max': 7.3846517807396594e-06, 'clip_ratio/region_mean': 5.5082955441321246e-05, 'epoch': 0.55}
+
+ 59%|█████▊    | 601/1024 [27:27:37<18:57:19, 161.32s/it][AINFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 602/1024 [27:30:25<19:08:59, 163.36s/it][A
+                                                         [A{'loss': 0.0577, 'grad_norm': 0.0027088895440101624, 'learning_rate': 1e-05, 'num_tokens': 530486578.0, 'completions/mean_length': 6574.9140625, 'completions/min_length': 371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6419.21484375, 'completions/min_terminated_length': 371.0, 'completions/max_terminated_length': 15898.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.26143792271614075, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020115964114665985, 'sampling/sampling_logp_difference/max': 11.352873802185059, 'sampling/importance_sampling_ratio/min': 1.1735714906535577e-05, 'sampling/importance_sampling_ratio/mean': 1.000026822090149, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9268836230039597, 'clip_ratio/low_mean': 4.8717710285473004e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0524913679764722e-06, 'clip_ratio/high_max': 8.209965471905889e-06, 'clip_ratio/region_mean': 5.077020244925734e-05, 'epoch': 0.55}
+
+ 59%|█████▉    | 602/1024 [27:30:25<19:08:59, 163.36s/it][AINFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 603/1024 [27:32:53<18:33:54, 158.75s/it][A
+                                                         [A{'loss': 0.0461, 'grad_norm': 0.002628365531563759, 'learning_rate': 1e-05, 'num_tokens': 531303083.0, 'completions/mean_length': 6209.1953125, 'completions/min_length': 598.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6129.07861328125, 'completions/min_terminated_length': 598.0, 'completions/max_terminated_length': 14361.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.13098490238189697, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.019658785313367844, 'sampling/sampling_logp_difference/max': 10.461148262023926, 'sampling/importance_sampling_ratio/min': 2.862734254449606e-05, 'sampling/importance_sampling_ratio/mean': 0.9998608827590942, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9574517607688904, 'clip_ratio/low_mean': 1.3909025255998131e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.31241858980502e-06, 'clip_ratio/high_max': 5.24967435922008e-06, 'clip_ratio/region_mean': 1.5221443845803151e-05, 'epoch': 0.55}
+
+ 59%|█████▉    | 603/1024 [27:32:53<18:33:54, 158.75s/it][AINFO 12-02 16:57:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 604/1024 [27:35:45<18:59:29, 162.78s/it][A
+                                                         [A{'loss': 0.0285, 'grad_norm': 0.004664157051593065, 'learning_rate': 1e-05, 'num_tokens': 532228227.0, 'completions/mean_length': 7079.1875, 'completions/min_length': 1015.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6855.87255859375, 'completions/min_terminated_length': 1015.0, 'completions/max_terminated_length': 13873.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.30327796936035156, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018260695040225983, 'sampling/sampling_logp_difference/max': 14.43586540222168, 'sampling/importance_sampling_ratio/min': 5.377535785555665e-07, 'sampling/importance_sampling_ratio/mean': 0.9999879598617554, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.853938102722168, 'clip_ratio/low_mean': 4.9158792762682424e-05, 'clip_ratio/low_min': 4.514427928370424e-06, 'clip_ratio/high_mean': 4.753649363919976e-06, 'clip_ratio/high_max': 1.9014597455679905e-05, 'clip_ratio/region_mean': 5.39124412171077e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 604/1024 [27:35:45<18:59:29, 162.78s/it][AINFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 605/1024 [27:38:02<18:02:44, 155.05s/it][A
+                                                         [A{'loss': 0.0168, 'grad_norm': 0.004579839296638966, 'learning_rate': 1e-05, 'num_tokens': 533024264.0, 'completions/mean_length': 6071.5390625, 'completions/min_length': 742.0, 'completions/max_length': 15094.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6071.5390625, 'completions/min_terminated_length': 742.0, 'completions/max_terminated_length': 15094.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.30327799916267395, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01974770799279213, 'sampling/sampling_logp_difference/max': 7.989465236663818, 'sampling/importance_sampling_ratio/min': 0.0003390153287909925, 'sampling/importance_sampling_ratio/mean': 0.999982476234436, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.980722151696682, 'clip_ratio/low_mean': 2.1738228269896354e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.563708891211718e-06, 'clip_ratio/high_max': 3.025483556484687e-05, 'clip_ratio/region_mean': 2.9301936820047558e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 605/1024 [27:38:02<18:02:44, 155.05s/it][AINFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 606/1024 [27:40:58<18:44:16, 161.38s/it][A
+                                                         [A{'loss': 0.0576, 'grad_norm': 0.002537919208407402, 'learning_rate': 1e-05, 'num_tokens': 533985318.0, 'completions/mean_length': 7352.484375, 'completions/min_length': 1310.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7209.12744140625, 'completions/min_terminated_length': 1310.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018647275865077972, 'sampling/sampling_logp_difference/max': 6.329618453979492, 'sampling/importance_sampling_ratio/min': 0.0017827138071879745, 'sampling/importance_sampling_ratio/mean': 0.9999037981033325, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7858814746141434, 'clip_ratio/low_mean': 5.142044130934664e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.773990667672479e-06, 'clip_ratio/high_max': 1.3344870239961892e-05, 'clip_ratio/region_mean': 5.6194432318079635e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 606/1024 [27:40:58<18:44:16, 161.38s/it][AINFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 607/1024 [27:44:01<19:26:07, 167.79s/it][A
+                                                         [A{'loss': 0.0648, 'grad_norm': 0.0037982286885380745, 'learning_rate': 1e-05, 'num_tokens': 534912558.0, 'completions/mean_length': 7095.1875, 'completions/min_length': 1073.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6947.74658203125, 'completions/min_terminated_length': 1073.0, 'completions/max_terminated_length': 16082.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01693977229297161, 'sampling/sampling_logp_difference/max': 9.422355651855469, 'sampling/importance_sampling_ratio/min': 8.089523180387914e-05, 'sampling/importance_sampling_ratio/mean': 0.9999147057533264, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6846291124820709, 'clip_ratio/low_mean': 4.466222731025482e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.0977013137962786e-06, 'clip_ratio/high_max': 2.345925531699322e-05, 'clip_ratio/region_mean': 5.175992941985896e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 607/1024 [27:44:01<19:26:07, 167.79s/it][AINFO 12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 608/1024 [27:46:49<19:22:58, 167.74s/it][A
+                                                         [A{'loss': 0.0716, 'grad_norm': 0.0030545955523848534, 'learning_rate': 1e-05, 'num_tokens': 535707127.0, 'completions/mean_length': 6038.1953125, 'completions/min_length': 677.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5873.9765625, 'completions/min_terminated_length': 677.0, 'completions/max_terminated_length': 15572.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3243142366409302, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018705151975154877, 'sampling/sampling_logp_difference/max': 8.624987602233887, 'sampling/importance_sampling_ratio/min': 0.00017956242663785815, 'sampling/importance_sampling_ratio/mean': 0.9999387264251709, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8637901693582535, 'clip_ratio/low_mean': 6.557838094067847e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2919629170937696e-06, 'clip_ratio/high_max': 5.167851668375079e-06, 'clip_ratio/region_mean': 6.687034363039857e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 608/1024 [27:46:49<19:22:58, 167.74s/it][AINFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 609/1024 [27:54:58<19:15:18, 167.03s/it][A
+                                                         [A{'loss': 0.0698, 'grad_norm': 0.002951717935502529, 'learning_rate': 1e-05, 'num_tokens': 536618376.0, 'completions/mean_length': 6978.0078125, 'completions/min_length': 69.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6828.70654296875, 'completions/min_terminated_length': 69.0, 'completions/max_terminated_length': 14906.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3527044355869293, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018486514687538147, 'sampling/sampling_logp_difference/max': 10.160879135131836, 'sampling/importance_sampling_ratio/min': 3.865327380481176e-05, 'sampling/importance_sampling_ratio/mean': 0.9999598264694214, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7931060045957565, 'clip_ratio/low_mean': 5.012885230826214e-05, 'clip_ratio/low_min': 3.5653165468829684e-06, 'clip_ratio/high_mean': 5.544901910070621e-06, 'clip_ratio/high_max': 1.7691760149318725e-05, 'clip_ratio/region_mean': 5.5673754559393274e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 609/1024 [27:54:58<19:15:18, 167.03s/it][AINFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|█████▉    | 610/1024 [27:57:42<30:03:20, 261.35s/it][A
+                                                         [A{'loss': 0.0973, 'grad_norm': 0.0019385438645258546, 'learning_rate': 1e-05, 'num_tokens': 537513876.0, 'completions/mean_length': 6810.15625, 'completions/min_length': 477.0, 'completions/max_length': 15329.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6810.15625, 'completions/min_terminated_length': 477.0, 'completions/max_terminated_length': 15329.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.28011518716812134, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02021351456642151, 'sampling/sampling_logp_difference/max': 9.934880256652832, 'sampling/importance_sampling_ratio/min': 4.845474904868752e-05, 'sampling/importance_sampling_ratio/mean': 1.000025749206543, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8957240954041481, 'clip_ratio/low_mean': 6.101864732954709e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.397787731453718e-06, 'clip_ratio/high_max': 2.1591150925814873e-05, 'clip_ratio/region_mean': 6.6416435629435e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 610/1024 [27:57:42<30:03:20, 261.35s/it][AINFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|█████▉    | 611/1024 [28:00:36<27:13:03, 237.25s/it][A
+                                                         [A{'loss': 0.0319, 'grad_norm': 0.001886329147964716, 'learning_rate': 1e-05, 'num_tokens': 538419265.0, 'completions/mean_length': 6940.4140625, 'completions/min_length': 370.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6713.7685546875, 'completions/min_terminated_length': 370.0, 'completions/max_terminated_length': 16065.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.19568344950675964, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019072774797677994, 'sampling/sampling_logp_difference/max': 14.18748950958252, 'sampling/importance_sampling_ratio/min': 6.893687327647058e-07, 'sampling/importance_sampling_ratio/mean': 1.0000052452087402, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8646975234150887, 'clip_ratio/low_mean': 1.2616926369446446e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.914362077419355e-06, 'clip_ratio/high_max': 1.4817902865615906e-05, 'clip_ratio/region_mean': 1.8531288333178964e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 611/1024 [28:00:36<27:13:03, 237.25s/it][AINFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|█████▉    | 612/1024 [28:03:47<25:33:16, 223.29s/it][A
+                                                         [A{'loss': 0.0335, 'grad_norm': 0.002031022449955344, 'learning_rate': 1e-05, 'num_tokens': 539399127.0, 'completions/mean_length': 7508.796875, 'completions/min_length': 607.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6995.35498046875, 'completions/min_terminated_length': 607.0, 'completions/max_terminated_length': 15960.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2301519513130188, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01832709088921547, 'sampling/sampling_logp_difference/max': 5.177490234375, 'sampling/importance_sampling_ratio/min': 0.0056421491317451, 'sampling/importance_sampling_ratio/mean': 0.9999816417694092, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7723299860954285, 'clip_ratio/low_mean': 3.254086982451554e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5920325040497119e-06, 'clip_ratio/high_max': 6.3681300161988474e-06, 'clip_ratio/region_mean': 3.4132902555938927e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 612/1024 [28:03:47<25:33:16, 223.29s/it][AINFO 12-02 17:28:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|█████▉    | 613/1024 [28:06:24<23:12:15, 203.25s/it][A
+                                                         [A{'loss': 0.1072, 'grad_norm': 0.003653773572295904, 'learning_rate': 1e-05, 'num_tokens': 540189602.0, 'completions/mean_length': 6019.6484375, 'completions/min_length': 1020.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5938.03955078125, 'completions/min_terminated_length': 1020.0, 'completions/max_terminated_length': 15816.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.26143303513526917, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017161473631858826, 'sampling/sampling_logp_difference/max': 5.242223262786865, 'sampling/importance_sampling_ratio/min': 0.005288486368954182, 'sampling/importance_sampling_ratio/mean': 0.9999122619628906, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7425512671470642, 'clip_ratio/low_mean': 2.6742804038804024e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9622444774067844e-06, 'clip_ratio/high_max': 1.5848977909627138e-05, 'clip_ratio/region_mean': 3.070504851621081e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 613/1024 [28:06:24<23:12:15, 203.25s/it][AINFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|█████▉    | 614/1024 [28:09:22<22:18:39, 195.90s/it][A
+                                                         [A{'loss': 0.0346, 'grad_norm': 0.003739065257832408, 'learning_rate': 1e-05, 'num_tokens': 541125587.0, 'completions/mean_length': 7155.6953125, 'completions/min_length': 987.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6621.826171875, 'completions/min_terminated_length': 987.0, 'completions/max_terminated_length': 15861.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02008877694606781, 'sampling/sampling_logp_difference/max': 11.59233570098877, 'sampling/importance_sampling_ratio/min': 9.236609002982732e-06, 'sampling/importance_sampling_ratio/mean': 0.9999271631240845, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9789249897003174, 'clip_ratio/low_mean': 3.428678644468164e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.754297725005017e-06, 'clip_ratio/high_max': 1.1017190900020069e-05, 'clip_ratio/region_mean': 3.7041084169686656e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 614/1024 [28:09:22<22:18:39, 195.90s/it][AINFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 615/1024 [28:12:21<21:40:21, 190.76s/it][A
+                                                         [A{'loss': 0.0524, 'grad_norm': 0.0020656392443925142, 'learning_rate': 1e-05, 'num_tokens': 542173801.0, 'completions/mean_length': 8027.359375, 'completions/min_length': 248.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7470.25048828125, 'completions/min_terminated_length': 248.0, 'completions/max_terminated_length': 13553.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.22225633263587952, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021495234221220016, 'sampling/sampling_logp_difference/max': 8.124446868896484, 'sampling/importance_sampling_ratio/min': 0.00029620854184031487, 'sampling/importance_sampling_ratio/mean': 0.999947190284729, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9153474718332291, 'clip_ratio/low_mean': 4.249646542575647e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4549021873099264e-06, 'clip_ratio/high_max': 5.6091539590852335e-06, 'clip_ratio/region_mean': 4.4951367613066395e-05, 'epoch': 0.57}
+
+ 60%|██████    | 615/1024 [28:12:21<21:40:21, 190.76s/it][AINFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 616/1024 [28:14:56<20:23:49, 179.97s/it][A
+                                                         [A{'loss': 0.0648, 'grad_norm': 0.00824788399040699, 'learning_rate': 1e-05, 'num_tokens': 542977266.0, 'completions/mean_length': 6115.3828125, 'completions/min_length': 1158.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5952.38916015625, 'completions/min_terminated_length': 1158.0, 'completions/max_terminated_length': 15879.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.30616888403892517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017732972279191017, 'sampling/sampling_logp_difference/max': 6.622807502746582, 'sampling/importance_sampling_ratio/min': 0.0013296925462782383, 'sampling/importance_sampling_ratio/mean': 0.9999478459358215, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.751783661544323, 'clip_ratio/low_mean': 5.2193488272678223e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.325646500547009e-06, 'clip_ratio/high_max': 1.7302586002188036e-05, 'clip_ratio/region_mean': 5.6519134659538395e-05, 'epoch': 0.57}
+
+ 60%|██████    | 616/1024 [28:14:56<20:23:49, 179.97s/it][AINFO 12-02 17:39:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:39:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:39:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:39:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 617/1024 [28:17:53<20:15:23, 179.17s/it][A
+                                                         [A{'loss': 0.0613, 'grad_norm': 0.005189655348658562, 'learning_rate': 1e-05, 'num_tokens': 543947515.0, 'completions/mean_length': 7431.3203125, 'completions/min_length': 738.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7142.52392578125, 'completions/min_terminated_length': 738.0, 'completions/max_terminated_length': 15688.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.21595832705497742, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02111673541367054, 'sampling/sampling_logp_difference/max': 8.644620895385742, 'sampling/importance_sampling_ratio/min': 0.00017607140762265772, 'sampling/importance_sampling_ratio/mean': 0.9999845623970032, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9122852608561516, 'clip_ratio/low_mean': 5.301810256241879e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.185486876755022e-06, 'clip_ratio/high_max': 2.872588265745435e-05, 'clip_ratio/region_mean': 6.120358921180014e-05, 'epoch': 0.57}
+
+ 60%|██████    | 617/1024 [28:17:53<20:15:23, 179.17s/it][AINFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 618/1024 [28:20:33<19:33:45, 173.46s/it][A
+                                                         [A{'loss': 0.0773, 'grad_norm': 0.004707770887762308, 'learning_rate': 1e-05, 'num_tokens': 544694826.0, 'completions/mean_length': 5700.5546875, 'completions/min_length': 727.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5530.9765625, 'completions/min_terminated_length': 727.0, 'completions/max_terminated_length': 16378.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3366856575012207, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018697837367653847, 'sampling/sampling_logp_difference/max': 21.374990463256836, 'sampling/importance_sampling_ratio/min': 5.211461817644647e-10, 'sampling/importance_sampling_ratio/mean': 0.9998490214347839, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8961661159992218, 'clip_ratio/low_mean': 3.414959587644262e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.961746627595858e-07, 'clip_ratio/high_max': 3.984698651038343e-06, 'clip_ratio/region_mean': 3.514577088026272e-05, 'epoch': 0.57}
+
+ 60%|██████    | 618/1024 [28:20:33<19:33:45, 173.46s/it][AINFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 619/1024 [28:22:21<17:17:44, 153.74s/it][A
+                                                         [A{'loss': 0.0492, 'grad_norm': 0.00980924628674984, 'learning_rate': 1e-05, 'num_tokens': 545255377.0, 'completions/mean_length': 4201.6796875, 'completions/min_length': 436.0, 'completions/max_length': 12422.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4201.6796875, 'completions/min_terminated_length': 436.0, 'completions/max_terminated_length': 12422.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.38664889335632324, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016301468014717102, 'sampling/sampling_logp_difference/max': 9.455235481262207, 'sampling/importance_sampling_ratio/min': 7.827866647858173e-05, 'sampling/importance_sampling_ratio/mean': 1.000074028968811, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7066933363676071, 'clip_ratio/low_mean': 5.229935004535946e-05, 'clip_ratio/low_min': 4.098226327187149e-06, 'clip_ratio/high_mean': 2.9524303499783855e-06, 'clip_ratio/high_max': 1.1809721399913542e-05, 'clip_ratio/region_mean': 5.525178062271152e-05, 'epoch': 0.57}
+
+ 60%|██████    | 619/1024 [28:22:21<17:17:44, 153.74s/it][AINFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 620/1024 [28:24:50<17:06:05, 152.39s/it][A
+                                                         [A{'loss': 0.077, 'grad_norm': 0.005619424395263195, 'learning_rate': 1e-05, 'num_tokens': 546013882.0, 'completions/mean_length': 5782.2578125, 'completions/min_length': 434.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5613.9765625, 'completions/min_terminated_length': 434.0, 'completions/max_terminated_length': 13234.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2472364753484726, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018704919144511223, 'sampling/sampling_logp_difference/max': 9.267168045043945, 'sampling/importance_sampling_ratio/min': 9.447568299947307e-05, 'sampling/importance_sampling_ratio/mean': 1.0000319480895996, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.846621498465538, 'clip_ratio/low_mean': 1.853809601470857e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5232756140903803e-06, 'clip_ratio/high_max': 6.093102456361521e-06, 'clip_ratio/region_mean': 2.0061371856172627e-05, 'epoch': 0.57}
+
+ 61%|██████    | 620/1024 [28:24:50<17:06:05, 152.39s/it][AINFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 621/1024 [28:27:24<17:06:56, 152.90s/it][A
+                                                         [A{'loss': 0.0964, 'grad_norm': 0.0063271005637943745, 'learning_rate': 1e-05, 'num_tokens': 546954857.0, 'completions/mean_length': 7191.4921875, 'completions/min_length': 1379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7045.57958984375, 'completions/min_terminated_length': 1379.0, 'completions/max_terminated_length': 15569.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01846012845635414, 'sampling/sampling_logp_difference/max': 5.062449932098389, 'sampling/importance_sampling_ratio/min': 0.006330032367259264, 'sampling/importance_sampling_ratio/mean': 0.9999164342880249, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7846563309431076, 'clip_ratio/low_mean': 4.008232758678787e-05, 'clip_ratio/low_min': 3.511630438879365e-06, 'clip_ratio/high_mean': 4.186933551864058e-06, 'clip_ratio/high_max': 1.6747734207456233e-05, 'clip_ratio/region_mean': 4.426926193445979e-05, 'epoch': 0.57}
+
+ 61%|██████    | 621/1024 [28:27:24<17:06:56, 152.90s/it][AINFO 12-02 17:52:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:52:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:52:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:52:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 622/1024 [28:29:32<16:13:51, 145.35s/it][A
+                                                         [A{'loss': 0.1013, 'grad_norm': 0.005836677737534046, 'learning_rate': 1e-05, 'num_tokens': 547676024.0, 'completions/mean_length': 5491.7421875, 'completions/min_length': 1644.0, 'completions/max_length': 15529.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5491.7421875, 'completions/min_terminated_length': 1644.0, 'completions/max_terminated_length': 15529.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.43213340640068054, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016565188765525818, 'sampling/sampling_logp_difference/max': 7.7476348876953125, 'sampling/importance_sampling_ratio/min': 0.00043176248436793685, 'sampling/importance_sampling_ratio/mean': 0.999930739402771, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6960643380880356, 'clip_ratio/low_mean': 5.253966105556174e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2486661603361426e-05, 'clip_ratio/high_max': 3.451678094279487e-05, 'clip_ratio/region_mean': 6.502632390947838e-05, 'epoch': 0.57}
+
+ 61%|██████    | 622/1024 [28:29:32<16:13:51, 145.35s/it][AINFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 623/1024 [28:32:28<17:11:38, 154.36s/it][A
+                                                         [A{'loss': 0.0326, 'grad_norm': 0.00226933928206563, 'learning_rate': 1e-05, 'num_tokens': 548590080.0, 'completions/mean_length': 6993.125, 'completions/min_length': 980.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6844.06396484375, 'completions/min_terminated_length': 980.0, 'completions/max_terminated_length': 16179.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.19332444667816162, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01880657486617565, 'sampling/sampling_logp_difference/max': 13.68293285369873, 'sampling/importance_sampling_ratio/min': 1.1417677114877733e-06, 'sampling/importance_sampling_ratio/mean': 1.000011682510376, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8031502217054367, 'clip_ratio/low_mean': 3.0399249226320535e-05, 'clip_ratio/low_min': 5.838393462909153e-06, 'clip_ratio/high_mean': 1.079745743481908e-06, 'clip_ratio/high_max': 4.318982973927632e-06, 'clip_ratio/region_mean': 3.147899496980244e-05, 'epoch': 0.57}
+
+ 61%|██████    | 623/1024 [28:32:28<17:11:38, 154.36s/it][AINFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 624/1024 [28:34:45<16:34:24, 149.16s/it][A
+                                                         [A{'loss': 0.0394, 'grad_norm': 0.005067484453320503, 'learning_rate': 1e-05, 'num_tokens': 549327251.0, 'completions/mean_length': 5602.8359375, 'completions/min_length': 100.0, 'completions/max_length': 15278.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5602.8359375, 'completions/min_terminated_length': 100.0, 'completions/max_terminated_length': 15278.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.35218530893325806, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018545404076576233, 'sampling/sampling_logp_difference/max': 5.624884605407715, 'sampling/importance_sampling_ratio/min': 0.0036069792695343494, 'sampling/importance_sampling_ratio/mean': 0.9999701380729675, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8287182524800301, 'clip_ratio/low_mean': 4.231840989632474e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.101052132275072e-06, 'clip_ratio/high_max': 8.404208529100288e-06, 'clip_ratio/region_mean': 4.441946202859981e-05, 'epoch': 0.57}
+
+ 61%|██████    | 624/1024 [28:34:45<16:34:24, 149.16s/it][AINFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 625/1024 [28:37:29<17:01:25, 153.60s/it][A
+                                                         [A{'loss': 0.0064, 'grad_norm': 0.0023132911883294582, 'learning_rate': 1e-05, 'num_tokens': 550208750.0, 'completions/mean_length': 6747.0234375, 'completions/min_length': 879.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6671.1416015625, 'completions/min_terminated_length': 879.0, 'completions/max_terminated_length': 15901.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.30904704332351685, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019216621294617653, 'sampling/sampling_logp_difference/max': 5.592033386230469, 'sampling/importance_sampling_ratio/min': 0.003727440955117345, 'sampling/importance_sampling_ratio/mean': 0.9999475479125977, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8722762316465378, 'clip_ratio/low_mean': 4.6288066641864134e-05, 'clip_ratio/low_min': 5.32640206074575e-06, 'clip_ratio/high_mean': 1.8743556893241475e-06, 'clip_ratio/high_max': 7.49742275729659e-06, 'clip_ratio/region_mean': 4.816242244487512e-05, 'epoch': 0.57}
+
+ 61%|██████    | 625/1024 [28:37:29<17:01:25, 153.60s/it][AINFO 12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 626/1024 [28:40:10<17:13:31, 155.81s/it][A
+                                                         [A{'loss': 0.0905, 'grad_norm': 0.0036700034979730844, 'learning_rate': 1e-05, 'num_tokens': 551123002.0, 'completions/mean_length': 6983.40625, 'completions/min_length': 385.0, 'completions/max_length': 16027.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6983.40625, 'completions/min_terminated_length': 385.0, 'completions/max_terminated_length': 16027.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2419992983341217, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019318291917443275, 'sampling/sampling_logp_difference/max': 9.8963041305542, 'sampling/importance_sampling_ratio/min': 5.0360464229015633e-05, 'sampling/importance_sampling_ratio/mean': 0.9999868273735046, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8781512826681137, 'clip_ratio/low_mean': 6.517495285152108e-05, 'clip_ratio/low_min': 1.1217302017030306e-05, 'clip_ratio/high_mean': 1.923391891978099e-06, 'clip_ratio/high_max': 7.693567567912396e-06, 'clip_ratio/region_mean': 6.709834497087286e-05, 'epoch': 0.58}
+
+ 61%|██████    | 626/1024 [28:40:10<17:13:31, 155.81s/it][AINFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 627/1024 [28:43:06<17:52:14, 162.05s/it][A
+                                                         [A{'loss': 0.0268, 'grad_norm': 0.0036717690527439117, 'learning_rate': 1e-05, 'num_tokens': 552055472.0, 'completions/mean_length': 7143.671875, 'completions/min_length': 451.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6689.22900390625, 'completions/min_terminated_length': 451.0, 'completions/max_terminated_length': 16201.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2212003767490387, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018518533557653427, 'sampling/sampling_logp_difference/max': 9.0, 'sampling/importance_sampling_ratio/min': 0.00012340980174485594, 'sampling/importance_sampling_ratio/mean': 0.9998798966407776, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7715872526168823, 'clip_ratio/low_mean': 5.9073974398415885e-05, 'clip_ratio/low_min': 6.781316187698394e-06, 'clip_ratio/high_mean': 1.2745738331432221e-06, 'clip_ratio/high_max': 5.098295332572889e-06, 'clip_ratio/region_mean': 6.034854845893278e-05, 'epoch': 0.58}
+
+ 61%|██████    | 627/1024 [28:43:06<17:52:14, 162.05s/it][AINFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████▏   | 628/1024 [28:46:00<18:13:29, 165.68s/it][A
+                                                         [A{'loss': 0.0494, 'grad_norm': 0.0019187588477507234, 'learning_rate': 1e-05, 'num_tokens': 552914275.0, 'completions/mean_length': 6558.5859375, 'completions/min_length': 1061.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6075.36865234375, 'completions/min_terminated_length': 1061.0, 'completions/max_terminated_length': 15729.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2041158676147461, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01948089525103569, 'sampling/sampling_logp_difference/max': 9.07090950012207, 'sampling/importance_sampling_ratio/min': 0.00011496193474158645, 'sampling/importance_sampling_ratio/mean': 0.9999418258666992, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9016438648104668, 'clip_ratio/low_mean': 2.460010267668622e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.4468678197517875e-06, 'clip_ratio/high_max': 1.778747127900715e-05, 'clip_ratio/region_mean': 2.9046970439594588e-05, 'epoch': 0.58}
+
+ 61%|██████▏   | 628/1024 [28:46:00<18:13:29, 165.68s/it][AINFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████▏   | 629/1024 [28:48:35<17:49:37, 162.48s/it][A
+                                                         [A{'loss': 0.0907, 'grad_norm': 0.003598993644118309, 'learning_rate': 1e-05, 'num_tokens': 553719958.0, 'completions/mean_length': 6150.2734375, 'completions/min_length': 596.0, 'completions/max_length': 15812.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6150.2734375, 'completions/min_terminated_length': 596.0, 'completions/max_terminated_length': 15812.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3022220730781555, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019557828083634377, 'sampling/sampling_logp_difference/max': 7.093727111816406, 'sampling/importance_sampling_ratio/min': 0.000830297009088099, 'sampling/importance_sampling_ratio/mean': 0.9999948740005493, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8385711833834648, 'clip_ratio/low_mean': 4.3287541757308645e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4582062653498724e-06, 'clip_ratio/high_max': 1.383282506139949e-05, 'clip_ratio/region_mean': 4.674574802265852e-05, 'epoch': 0.58}
+
+ 61%|██████▏   | 629/1024 [28:48:35<17:49:37, 162.48s/it][AINFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 630/1024 [28:51:46<18:43:12, 171.05s/it][A
+                                                         [A{'loss': 0.0261, 'grad_norm': 0.002453390508890152, 'learning_rate': 1e-05, 'num_tokens': 554784458.0, 'completions/mean_length': 8142.46875, 'completions/min_length': 1828.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7519.16015625, 'completions/min_terminated_length': 1828.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.1422954648733139, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.019445519894361496, 'sampling/sampling_logp_difference/max': 8.498891830444336, 'sampling/importance_sampling_ratio/min': 0.0002036939695244655, 'sampling/importance_sampling_ratio/mean': 0.9999715089797974, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8508284538984299, 'clip_ratio/low_mean': 1.7461135655594262e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.672075301139557e-07, 'clip_ratio/high_max': 2.668830120455823e-06, 'clip_ratio/region_mean': 1.8128343185708218e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 630/1024 [28:51:46<18:43:12, 171.05s/it][AINFO 12-02 18:16:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 631/1024 [28:54:36<18:37:35, 170.62s/it][A
+                                                         [A{'loss': 0.0245, 'grad_norm': 0.0027936683036386967, 'learning_rate': 1e-05, 'num_tokens': 555783296.0, 'completions/mean_length': 7665.921875, 'completions/min_length': 791.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7384.693359375, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 16109.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.24435830116271973, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01912892609834671, 'sampling/sampling_logp_difference/max': 8.187341690063477, 'sampling/importance_sampling_ratio/min': 0.0002781523216981441, 'sampling/importance_sampling_ratio/mean': 0.9998488426208496, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7667205557227135, 'clip_ratio/low_mean': 3.1556500402984966e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.750615062221186e-06, 'clip_ratio/high_max': 1.9002460248884745e-05, 'clip_ratio/region_mean': 3.630711614732718e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 631/1024 [28:54:36<18:37:35, 170.62s/it][AINFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 632/1024 [28:57:25<18:30:49, 170.03s/it][A
+                                                         [A{'loss': 0.1028, 'grad_norm': 0.004213637672364712, 'learning_rate': 1e-05, 'num_tokens': 556732942.0, 'completions/mean_length': 7266.171875, 'completions/min_length': 1117.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6972.04833984375, 'completions/min_terminated_length': 1117.0, 'completions/max_terminated_length': 16379.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3135277032852173, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01689826510846615, 'sampling/sampling_logp_difference/max': 13.249999046325684, 'sampling/importance_sampling_ratio/min': 1.760348027346481e-06, 'sampling/importance_sampling_ratio/mean': 0.9999159574508667, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7114122956991196, 'clip_ratio/low_mean': 3.8605214058407e-05, 'clip_ratio/low_min': 6.2870940382708795e-06, 'clip_ratio/high_mean': 3.8924990235500445e-06, 'clip_ratio/high_max': 1.5569996094200178e-05, 'clip_ratio/region_mean': 4.249771222930576e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 632/1024 [28:57:25<18:30:49, 170.03s/it][AINFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 633/1024 [29:00:12<18:23:09, 169.28s/it][A
+                                                         [A{'loss': 0.0406, 'grad_norm': 0.004169877618551254, 'learning_rate': 1e-05, 'num_tokens': 557589141.0, 'completions/mean_length': 6532.9921875, 'completions/min_length': 757.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6296.568359375, 'completions/min_terminated_length': 757.0, 'completions/max_terminated_length': 16054.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2675113081932068, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018738210201263428, 'sampling/sampling_logp_difference/max': 12.311498641967773, 'sampling/importance_sampling_ratio/min': 4.499705482885474e-06, 'sampling/importance_sampling_ratio/mean': 0.9999022483825684, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7711968123912811, 'clip_ratio/low_mean': 3.640393322257296e-05, 'clip_ratio/low_min': 3.0146634344419e-06, 'clip_ratio/high_mean': 5.434466118003911e-06, 'clip_ratio/high_max': 2.1737864472015644e-05, 'clip_ratio/region_mean': 4.183839985216764e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 633/1024 [29:00:12<18:23:09, 169.28s/it][AINFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 634/1024 [29:03:07<18:31:43, 171.03s/it][A
+                                                         [A{'loss': 0.0565, 'grad_norm': 0.0032470994628965855, 'learning_rate': 1e-05, 'num_tokens': 558557286.0, 'completions/mean_length': 7384.3203125, 'completions/min_length': 87.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7168.328125, 'completions/min_terminated_length': 87.0, 'completions/max_terminated_length': 16337.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019018521532416344, 'sampling/sampling_logp_difference/max': 8.535643577575684, 'sampling/importance_sampling_ratio/min': 0.00019634375348687172, 'sampling/importance_sampling_ratio/mean': 0.9999680519104004, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8054972141981125, 'clip_ratio/low_mean': 6.070675681257853e-05, 'clip_ratio/low_min': 5.175126261747209e-06, 'clip_ratio/high_mean': 1.5248809290824283e-06, 'clip_ratio/high_max': 6.099523716329713e-06, 'clip_ratio/region_mean': 6.223163745744387e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 634/1024 [29:03:07<18:31:43, 171.03s/it][AINFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 635/1024 [29:05:40<17:53:48, 165.63s/it][A
+                                                         [A{'loss': 0.1247, 'grad_norm': 0.004848263692110777, 'learning_rate': 1e-05, 'num_tokens': 559364639.0, 'completions/mean_length': 6131.9453125, 'completions/min_length': 820.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6051.22021484375, 'completions/min_terminated_length': 820.0, 'completions/max_terminated_length': 15918.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018360167741775513, 'sampling/sampling_logp_difference/max': 12.124655723571777, 'sampling/importance_sampling_ratio/min': 5.424115443020128e-06, 'sampling/importance_sampling_ratio/mean': 1.000056266784668, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8365718051791191, 'clip_ratio/low_mean': 3.798940008437057e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1092134911905305e-05, 'clip_ratio/high_max': 4.436853964762122e-05, 'clip_ratio/region_mean': 4.908153437099827e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 635/1024 [29:05:40<17:53:48, 165.63s/it][AINFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 636/1024 [29:08:24<17:48:09, 165.18s/it][A
+                                                         [A{'loss': 0.0279, 'grad_norm': 0.003403177484869957, 'learning_rate': 1e-05, 'num_tokens': 560119248.0, 'completions/mean_length': 5746.8828125, 'completions/min_length': 131.0, 'completions/max_length': 15724.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5746.8828125, 'completions/min_terminated_length': 131.0, 'completions/max_terminated_length': 15724.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015006184577941895, 'sampling/sampling_logp_difference/max': 14.25, 'sampling/importance_sampling_ratio/min': 6.475952432083432e-07, 'sampling/importance_sampling_ratio/mean': 0.9999486207962036, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6247628927230835, 'clip_ratio/low_mean': 2.7543567512111622e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.849658353123232e-06, 'clip_ratio/high_max': 1.9398633412492927e-05, 'clip_ratio/region_mean': 3.239322609260853e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 636/1024 [29:08:24<17:48:09, 165.18s/it][AINFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 637/1024 [29:11:12<17:49:07, 165.76s/it][A
+                                                         [A{'loss': 0.093, 'grad_norm': 0.004058506805449724, 'learning_rate': 1e-05, 'num_tokens': 561072493.0, 'completions/mean_length': 7313.7890625, 'completions/min_length': 1068.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7096.1044921875, 'completions/min_terminated_length': 1068.0, 'completions/max_terminated_length': 16209.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3079911172389984, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01940958946943283, 'sampling/sampling_logp_difference/max': 7.320003509521484, 'sampling/importance_sampling_ratio/min': 0.0006621598731726408, 'sampling/importance_sampling_ratio/mean': 0.9999264478683472, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8606570512056351, 'clip_ratio/low_mean': 4.927243321617425e-05, 'clip_ratio/low_min': 5.929088274569949e-06, 'clip_ratio/high_mean': 8.111364707019675e-06, 'clip_ratio/high_max': 2.857848289750109e-05, 'clip_ratio/region_mean': 5.738379809372418e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 637/1024 [29:11:12<17:49:07, 165.76s/it][AINFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 638/1024 [29:14:01<17:52:57, 166.78s/it][A
+                                                         [A{'loss': 0.098, 'grad_norm': 0.002768489997833967, 'learning_rate': 1e-05, 'num_tokens': 562048734.0, 'completions/mean_length': 7495.5078125, 'completions/min_length': 882.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7425.51953125, 'completions/min_terminated_length': 882.0, 'completions/max_terminated_length': 16093.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.344813734292984, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0189508069306612, 'sampling/sampling_logp_difference/max': 11.133618354797363, 'sampling/importance_sampling_ratio/min': 1.4612716768169776e-05, 'sampling/importance_sampling_ratio/mean': 0.9999319314956665, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8225502669811249, 'clip_ratio/low_mean': 4.890350828645751e-05, 'clip_ratio/low_min': 3.968002147303196e-06, 'clip_ratio/high_mean': 7.758043807370996e-06, 'clip_ratio/high_max': 2.7213282010052353e-05, 'clip_ratio/region_mean': 5.666155129802064e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 638/1024 [29:14:01<17:52:57, 166.78s/it][AINFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 639/1024 [29:16:56<18:06:14, 169.28s/it][A
+                                                         [A{'loss': 0.0507, 'grad_norm': 0.002966079628095031, 'learning_rate': 1e-05, 'num_tokens': 562945623.0, 'completions/mean_length': 6856.5703125, 'completions/min_length': 173.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6627.912109375, 'completions/min_terminated_length': 173.0, 'completions/max_terminated_length': 15894.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3016803562641144, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019664689898490906, 'sampling/sampling_logp_difference/max': 8.624966621398926, 'sampling/importance_sampling_ratio/min': 0.0001795661955839023, 'sampling/importance_sampling_ratio/mean': 0.9998261332511902, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8542520478367805, 'clip_ratio/low_mean': 4.9131452101391915e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.311648519385926e-06, 'clip_ratio/high_max': 2.5246594077543705e-05, 'clip_ratio/region_mean': 5.544310107552519e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 639/1024 [29:16:56<18:06:14, 169.28s/it][AINFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▎   | 640/1024 [29:19:39<17:52:00, 167.50s/it][A
+                                                         [A{'loss': 0.0164, 'grad_norm': 0.0021058651618659496, 'learning_rate': 1e-05, 'num_tokens': 563789214.0, 'completions/mean_length': 6463.2421875, 'completions/min_length': 812.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6305.77001953125, 'completions/min_terminated_length': 812.0, 'completions/max_terminated_length': 15231.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.24541424214839935, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01898353546857834, 'sampling/sampling_logp_difference/max': 7.749993324279785, 'sampling/importance_sampling_ratio/min': 0.00043074542190879583, 'sampling/importance_sampling_ratio/mean': 0.9998518824577332, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8427078947424889, 'clip_ratio/low_mean': 4.154238490627904e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.316983106240514e-06, 'clip_ratio/high_max': 1.2127683930884814e-05, 'clip_ratio/region_mean': 4.685936778514588e-05, 'epoch': 0.59}
+
+ 62%|██████▎   | 640/1024 [29:19:39<17:52:00, 167.50s/it][AINFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 641/1024 [29:22:20<17:36:48, 165.56s/it][A
+                                                         [A{'loss': 0.0816, 'grad_norm': 0.005890186410397291, 'learning_rate': 1e-05, 'num_tokens': 564596185.0, 'completions/mean_length': 6140.7734375, 'completions/min_length': 780.0, 'completions/max_length': 15232.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6140.7734375, 'completions/min_terminated_length': 780.0, 'completions/max_terminated_length': 15232.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.23486016690731049, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01930009014904499, 'sampling/sampling_logp_difference/max': 7.120187759399414, 'sampling/importance_sampling_ratio/min': 0.000808614946436137, 'sampling/importance_sampling_ratio/mean': 0.9998830556869507, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8800382614135742, 'clip_ratio/low_mean': 3.146892504446441e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1398174655805633e-06, 'clip_ratio/high_max': 1.2559269862322253e-05, 'clip_ratio/region_mean': 3.4608742623731814e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 641/1024 [29:22:20<17:36:48, 165.56s/it][AINFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 642/1024 [29:25:07<17:36:17, 165.91s/it][A
+                                                         [A{'loss': -0.0094, 'grad_norm': 0.003226465079933405, 'learning_rate': 1e-05, 'num_tokens': 565430387.0, 'completions/mean_length': 6361.703125, 'completions/min_length': 510.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6202.61962890625, 'completions/min_terminated_length': 510.0, 'completions/max_terminated_length': 16246.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2682726979255676, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019014433026313782, 'sampling/sampling_logp_difference/max': 5.405893802642822, 'sampling/importance_sampling_ratio/min': 0.004490039311349392, 'sampling/importance_sampling_ratio/mean': 0.9999127984046936, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8246701806783676, 'clip_ratio/low_mean': 4.3151162458343606e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2748337212542538e-06, 'clip_ratio/high_max': 5.099334885017015e-06, 'clip_ratio/region_mean': 4.442599617959786e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 642/1024 [29:25:07<17:36:17, 165.91s/it][AINFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 643/1024 [29:28:15<18:15:46, 172.56s/it][A
+                                                         [A{'loss': 0.0597, 'grad_norm': 0.003077681176364422, 'learning_rate': 1e-05, 'num_tokens': 566393214.0, 'completions/mean_length': 7363.5234375, 'completions/min_length': 706.0, 'completions/max_length': 16283.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7363.5234375, 'completions/min_terminated_length': 706.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.24830512702465057, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01871068961918354, 'sampling/sampling_logp_difference/max': 14.924853324890137, 'sampling/importance_sampling_ratio/min': 3.297756165920873e-07, 'sampling/importance_sampling_ratio/mean': 1.000014066696167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.828450471162796, 'clip_ratio/low_mean': 3.808748408573592e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.822751001640427e-06, 'clip_ratio/high_max': 2.8547008014356834e-05, 'clip_ratio/region_mean': 4.591023491684609e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 643/1024 [29:28:15<18:15:46, 172.56s/it][AINFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 644/1024 [29:31:06<18:10:46, 172.23s/it][A
+                                                         [A{'loss': -0.0326, 'grad_norm': 0.0023631115909665823, 'learning_rate': 1e-05, 'num_tokens': 567294697.0, 'completions/mean_length': 6883.8984375, 'completions/min_length': 830.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6809.09423828125, 'completions/min_terminated_length': 830.0, 'completions/max_terminated_length': 16016.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.22567616403102875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02030845358967781, 'sampling/sampling_logp_difference/max': 5.291841983795166, 'sampling/importance_sampling_ratio/min': 0.005032482091337442, 'sampling/importance_sampling_ratio/mean': 0.9999625086784363, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9114723727107048, 'clip_ratio/low_mean': 1.9775024611590197e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2140636727053788e-06, 'clip_ratio/high_max': 4.856254690821515e-06, 'clip_ratio/region_mean': 2.098908817060874e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 644/1024 [29:31:06<18:10:46, 172.23s/it][AINFO 12-02 18:56:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:56:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:56:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:56:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 645/1024 [29:33:47<17:44:59, 168.60s/it][A
+                                                         [A{'loss': 0.071, 'grad_norm': 0.006442595738917589, 'learning_rate': 1e-05, 'num_tokens': 568210240.0, 'completions/mean_length': 6996.9296875, 'completions/min_length': 1477.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6923.015625, 'completions/min_terminated_length': 1477.0, 'completions/max_terminated_length': 16376.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3061561584472656, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018702290952205658, 'sampling/sampling_logp_difference/max': 6.779873847961426, 'sampling/importance_sampling_ratio/min': 0.0011364181991666555, 'sampling/importance_sampling_ratio/mean': 0.9999593496322632, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7864109799265862, 'clip_ratio/low_mean': 3.9204465110742603e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.152158688455529e-06, 'clip_ratio/high_max': 4.608634753822116e-06, 'clip_ratio/region_mean': 4.035662391288497e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 645/1024 [29:33:47<17:44:59, 168.60s/it][AINFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 646/1024 [29:36:25<17:22:03, 165.40s/it][A
+                                                         [A{'loss': 0.0495, 'grad_norm': 0.004090449772775173, 'learning_rate': 1e-05, 'num_tokens': 569046727.0, 'completions/mean_length': 6384.5546875, 'completions/min_length': 878.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6305.81884765625, 'completions/min_terminated_length': 878.0, 'completions/max_terminated_length': 16367.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.3266732692718506, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017125204205513, 'sampling/sampling_logp_difference/max': 7.8639349937438965, 'sampling/importance_sampling_ratio/min': 0.00038435845635831356, 'sampling/importance_sampling_ratio/mean': 0.9999207854270935, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7353173196315765, 'clip_ratio/low_mean': 5.24772226526693e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.011521352571435e-06, 'clip_ratio/high_max': 1.442532902728999e-05, 'clip_ratio/region_mean': 5.748874355049338e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 646/1024 [29:36:25<17:22:03, 165.40s/it][AINFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 647/1024 [29:39:09<17:17:17, 165.09s/it][A
+                                                         [A{'loss': 0.0076, 'grad_norm': 0.0030447279568761587, 'learning_rate': 1e-05, 'num_tokens': 569975323.0, 'completions/mean_length': 7074.59375, 'completions/min_length': 623.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6696.29248046875, 'completions/min_terminated_length': 623.0, 'completions/max_terminated_length': 16258.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.17176413536071777, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019303584471344948, 'sampling/sampling_logp_difference/max': 3.709077835083008, 'sampling/importance_sampling_ratio/min': 0.024500105530023575, 'sampling/importance_sampling_ratio/mean': 0.9999834299087524, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9198992624878883, 'clip_ratio/low_mean': 3.2856025427463464e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0019189125596313e-06, 'clip_ratio/high_max': 1.2007675650238525e-05, 'clip_ratio/region_mean': 3.585794411264942e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 647/1024 [29:39:09<17:17:17, 165.09s/it][AINFO 12-02 19:04:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:04:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:04:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:04:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 648/1024 [29:42:33<18:27:57, 176.80s/it][A
+                                                         [A{'loss': 0.0678, 'grad_norm': 0.004508152138441801, 'learning_rate': 1e-05, 'num_tokens': 571024900.0, 'completions/mean_length': 8044.2578125, 'completions/min_length': 902.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7181.52587890625, 'completions/min_terminated_length': 902.0, 'completions/max_terminated_length': 16211.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.26698729395866394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018804769963026047, 'sampling/sampling_logp_difference/max': 10.130229949951172, 'sampling/importance_sampling_ratio/min': 3.98563061025925e-05, 'sampling/importance_sampling_ratio/mean': 0.9999692440032959, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8030193895101547, 'clip_ratio/low_mean': 7.121561156964162e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5884191952864057e-06, 'clip_ratio/high_max': 6.353676781145623e-06, 'clip_ratio/region_mean': 7.280403042386752e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 648/1024 [29:42:33<18:27:57, 176.80s/it][AINFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 649/1024 [29:45:37<18:39:08, 179.06s/it][A
+                                                         [A{'loss': 0.0265, 'grad_norm': 0.003926917444914579, 'learning_rate': 1e-05, 'num_tokens': 572125141.0, 'completions/mean_length': 8451.7578125, 'completions/min_length': 813.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7922.94189453125, 'completions/min_terminated_length': 813.0, 'completions/max_terminated_length': 15903.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.19226360321044922, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021555956453084946, 'sampling/sampling_logp_difference/max': 16.238862991333008, 'sampling/importance_sampling_ratio/min': 8.862401301712453e-08, 'sampling/importance_sampling_ratio/mean': 0.9999009370803833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.008152723312378, 'clip_ratio/low_mean': 3.612134810282441e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7038794339896413e-06, 'clip_ratio/high_max': 6.815517735958565e-06, 'clip_ratio/region_mean': 3.7825227536814054e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 649/1024 [29:45:37<18:39:08, 179.06s/it][AINFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 650/1024 [29:48:47<18:55:56, 182.24s/it][A
+                                                         [A{'loss': 0.0367, 'grad_norm': 0.0036475847009569407, 'learning_rate': 1e-05, 'num_tokens': 573041934.0, 'completions/mean_length': 7011.8203125, 'completions/min_length': 728.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6786.88818359375, 'completions/min_terminated_length': 728.0, 'completions/max_terminated_length': 16120.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02046291157603264, 'sampling/sampling_logp_difference/max': 10.249990463256836, 'sampling/importance_sampling_ratio/min': 3.535783980623819e-05, 'sampling/importance_sampling_ratio/mean': 0.9999783039093018, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8761812150478363, 'clip_ratio/low_mean': 5.86272076361638e-05, 'clip_ratio/low_min': 1.1987166999460896e-05, 'clip_ratio/high_mean': 3.796089742991171e-06, 'clip_ratio/high_max': 1.5184358971964684e-05, 'clip_ratio/region_mean': 6.242329754968523e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 650/1024 [29:48:47<18:55:56, 182.24s/it][AINFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▎   | 651/1024 [29:51:40<18:36:20, 179.57s/it][A
+                                                         [A{'loss': 0.0678, 'grad_norm': 0.0038963130209594965, 'learning_rate': 1e-05, 'num_tokens': 574040917.0, 'completions/mean_length': 7665.7421875, 'completions/min_length': 816.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7161.3798828125, 'completions/min_terminated_length': 816.0, 'completions/max_terminated_length': 15510.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3169426918029785, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01943662390112877, 'sampling/sampling_logp_difference/max': 13.374890327453613, 'sampling/importance_sampling_ratio/min': 1.5536705859631184e-06, 'sampling/importance_sampling_ratio/mean': 0.9999545812606812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7933268994092941, 'clip_ratio/low_mean': 4.855269958170538e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.744779692420707e-06, 'clip_ratio/high_max': 1.0979118769682827e-05, 'clip_ratio/region_mean': 5.129747910359583e-05, 'epoch': 0.6}
+
+ 64%|██████▎   | 651/1024 [29:51:40<18:36:20, 179.57s/it][AINFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▎   | 652/1024 [29:54:45<18:42:07, 180.99s/it][A
+                                                         [A{'loss': 0.0418, 'grad_norm': 0.0030520735308527946, 'learning_rate': 1e-05, 'num_tokens': 575078695.0, 'completions/mean_length': 7966.828125, 'completions/min_length': 553.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7695.30615234375, 'completions/min_terminated_length': 553.0, 'completions/max_terminated_length': 16049.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.19332443177700043, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0197810810059309, 'sampling/sampling_logp_difference/max': 7.872013568878174, 'sampling/importance_sampling_ratio/min': 0.00038126588333398104, 'sampling/importance_sampling_ratio/mean': 1.0000214576721191, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8473240435123444, 'clip_ratio/low_mean': 2.4625115656817798e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.893257598974742e-06, 'clip_ratio/high_max': 9.610412234906107e-06, 'clip_ratio/region_mean': 2.8518373483166215e-05, 'epoch': 0.6}
+
+ 64%|██████▎   | 652/1024 [29:54:45<18:42:07, 180.99s/it][AINFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 653/1024 [29:57:06<17:26:01, 169.17s/it][A
+                                                         [A{'loss': 0.1305, 'grad_norm': 0.0029330148827284575, 'learning_rate': 1e-05, 'num_tokens': 575915163.0, 'completions/mean_length': 6384.53125, 'completions/min_length': 1045.0, 'completions/max_length': 15116.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6384.53125, 'completions/min_terminated_length': 1045.0, 'completions/max_terminated_length': 15116.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019899431616067886, 'sampling/sampling_logp_difference/max': 8.872506141662598, 'sampling/importance_sampling_ratio/min': 0.0001401908230036497, 'sampling/importance_sampling_ratio/mean': 0.9999364614486694, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9130589440464973, 'clip_ratio/low_mean': 3.762348410418781e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0246395049762214e-05, 'clip_ratio/high_max': 4.0985580199048854e-05, 'clip_ratio/region_mean': 4.7869878471829e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 653/1024 [29:57:06<17:26:01, 169.17s/it][AINFO 12-02 19:22:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:22:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 654/1024 [29:59:37<16:50:07, 163.80s/it][A
+                                                         [A{'loss': 0.0328, 'grad_norm': 0.0037648119032382965, 'learning_rate': 1e-05, 'num_tokens': 576895261.0, 'completions/mean_length': 7484.140625, 'completions/min_length': 745.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7414.06298828125, 'completions/min_terminated_length': 745.0, 'completions/max_terminated_length': 14716.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2987973093986511, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020455794408917427, 'sampling/sampling_logp_difference/max': 8.220190048217773, 'sampling/importance_sampling_ratio/min': 0.0002691639238037169, 'sampling/importance_sampling_ratio/mean': 0.9999864101409912, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8762720301747322, 'clip_ratio/low_mean': 4.3348386952857254e-05, 'clip_ratio/low_min': 3.435481630731374e-06, 'clip_ratio/high_mean': 1.2012300203423365e-06, 'clip_ratio/high_max': 4.804920081369346e-06, 'clip_ratio/region_mean': 4.454961697319959e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 654/1024 [29:59:37<16:50:07, 163.80s/it][AINFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 655/1024 [30:02:29<17:00:50, 165.99s/it][A
+                                                         [A{'loss': 0.0874, 'grad_norm': 0.0022230292670428753, 'learning_rate': 1e-05, 'num_tokens': 577874916.0, 'completions/mean_length': 7483.8671875, 'completions/min_length': 447.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7196.76611328125, 'completions/min_terminated_length': 447.0, 'completions/max_terminated_length': 15614.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3322049677371979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019235530868172646, 'sampling/sampling_logp_difference/max': 6.195826530456543, 'sampling/importance_sampling_ratio/min': 0.002037918195128441, 'sampling/importance_sampling_ratio/mean': 1.0000191926956177, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8481424525380135, 'clip_ratio/low_mean': 5.7342298759976984e-05, 'clip_ratio/low_min': 1.5017260921013076e-05, 'clip_ratio/high_mean': 5.822761295348755e-06, 'clip_ratio/high_max': 2.329104518139502e-05, 'clip_ratio/region_mean': 6.316505982795206e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 655/1024 [30:02:29<17:00:50, 165.99s/it][AINFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 656/1024 [30:05:24<17:14:51, 168.73s/it][A
+                                                         [A{'loss': 0.0249, 'grad_norm': 0.003690029727295041, 'learning_rate': 1e-05, 'num_tokens': 578741608.0, 'completions/mean_length': 6618.34375, 'completions/min_length': 563.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6541.44873046875, 'completions/min_terminated_length': 563.0, 'completions/max_terminated_length': 15621.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.22673210501670837, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019522596150636673, 'sampling/sampling_logp_difference/max': 10.818385124206543, 'sampling/importance_sampling_ratio/min': 2.0027882783324458e-05, 'sampling/importance_sampling_ratio/mean': 0.9998915195465088, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8699518665671349, 'clip_ratio/low_mean': 3.113216860128887e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0502737925198744e-06, 'clip_ratio/high_max': 8.201095170079498e-06, 'clip_ratio/region_mean': 3.318244205274823e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 656/1024 [30:05:24<17:14:51, 168.73s/it][AINFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 657/1024 [30:08:21<17:28:29, 171.41s/it][A
+                                                         [A{'loss': 0.0846, 'grad_norm': 0.004026883281767368, 'learning_rate': 1e-05, 'num_tokens': 579617377.0, 'completions/mean_length': 6699.6953125, 'completions/min_length': 693.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6223.41748046875, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 16165.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018191896378993988, 'sampling/sampling_logp_difference/max': 14.687499046325684, 'sampling/importance_sampling_ratio/min': 4.181192991836724e-07, 'sampling/importance_sampling_ratio/mean': 0.9997950792312622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7825306504964828, 'clip_ratio/low_mean': 5.6235591728182044e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0405914281363948e-06, 'clip_ratio/high_max': 4.162365712545579e-06, 'clip_ratio/region_mean': 5.7276183270005276e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 657/1024 [30:08:21<17:28:29, 171.41s/it][AINFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 658/1024 [30:10:41<16:27:35, 161.90s/it][A
+                                                         [A{'loss': 0.0796, 'grad_norm': 0.004194674547761679, 'learning_rate': 1e-05, 'num_tokens': 580402633.0, 'completions/mean_length': 5984.875, 'completions/min_length': 1404.0, 'completions/max_length': 15406.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5984.875, 'completions/min_terminated_length': 1404.0, 'completions/max_terminated_length': 15406.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019084136933088303, 'sampling/sampling_logp_difference/max': 5.749660015106201, 'sampling/importance_sampling_ratio/min': 0.003183862892910838, 'sampling/importance_sampling_ratio/mean': 0.9999486804008484, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8239431977272034, 'clip_ratio/low_mean': 3.858270270029607e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.379652520787204e-06, 'clip_ratio/high_max': 2.1518610083148815e-05, 'clip_ratio/region_mean': 4.396235544845695e-05, 'epoch': 0.61}
+
+ 64%|██████▍   | 658/1024 [30:10:41<16:27:35, 161.90s/it][AINFO 12-02 19:35:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:35:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:35:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:35:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 659/1024 [30:13:17<16:14:40, 160.22s/it][A
+                                                         [A{'loss': -0.001, 'grad_norm': 0.005016419570893049, 'learning_rate': 1e-05, 'num_tokens': 581187586.0, 'completions/mean_length': 5950.5703125, 'completions/min_length': 1140.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5784.96044921875, 'completions/min_terminated_length': 1140.0, 'completions/max_terminated_length': 15690.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2306838035583496, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01908070594072342, 'sampling/sampling_logp_difference/max': 5.809609413146973, 'sampling/importance_sampling_ratio/min': 0.002998600946739316, 'sampling/importance_sampling_ratio/mean': 0.9999349117279053, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8884857445955276, 'clip_ratio/low_mean': 5.0344978262728546e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.680707826944854e-06, 'clip_ratio/high_max': 2.6722831307779416e-05, 'clip_ratio/region_mean': 5.702568614651682e-05, 'epoch': 0.61}
+
+ 64%|██████▍   | 659/1024 [30:13:17<16:14:40, 160.22s/it][AINFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 660/1024 [30:16:13<16:39:42, 164.79s/it][A
+                                                         [A{'loss': 0.0718, 'grad_norm': 0.0035609283950179815, 'learning_rate': 1e-05, 'num_tokens': 582236557.0, 'completions/mean_length': 8029.7734375, 'completions/min_length': 1584.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7690.17041015625, 'completions/min_terminated_length': 1584.0, 'completions/max_terminated_length': 16101.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01982714608311653, 'sampling/sampling_logp_difference/max': 5.2554192543029785, 'sampling/importance_sampling_ratio/min': 0.005219157785177231, 'sampling/importance_sampling_ratio/mean': 0.999931275844574, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.858074463903904, 'clip_ratio/low_mean': 2.6390790822006238e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1695884697692236e-06, 'clip_ratio/high_max': 8.678353879076894e-06, 'clip_ratio/region_mean': 2.8560379291775462e-05, 'epoch': 0.61}
+
+ 64%|██████▍   | 660/1024 [30:16:13<16:39:42, 164.79s/it][AINFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 661/1024 [30:19:04<16:48:57, 166.77s/it][A
+                                                         [A{'loss': 0.0505, 'grad_norm': 0.005020176526159048, 'learning_rate': 1e-05, 'num_tokens': 583150740.0, 'completions/mean_length': 6958.4921875, 'completions/min_length': 904.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6494.9423828125, 'completions/min_terminated_length': 904.0, 'completions/max_terminated_length': 16063.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018450919538736343, 'sampling/sampling_logp_difference/max': 3.8077571392059326, 'sampling/importance_sampling_ratio/min': 0.022197909653186798, 'sampling/importance_sampling_ratio/mean': 0.999988853931427, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7957572638988495, 'clip_ratio/low_mean': 3.278200858858327e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.522766622969357e-06, 'clip_ratio/high_max': 2.362454961257754e-05, 'clip_ratio/region_mean': 4.030477487049211e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 661/1024 [30:19:04<16:48:57, 166.77s/it][AINFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 662/1024 [30:21:38<16:23:04, 162.94s/it][A
+                                                         [A{'loss': 0.0826, 'grad_norm': 0.005002971272915602, 'learning_rate': 1e-05, 'num_tokens': 584044250.0, 'completions/mean_length': 6810.234375, 'completions/min_length': 1105.0, 'completions/max_length': 15856.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6810.234375, 'completions/min_terminated_length': 1105.0, 'completions/max_terminated_length': 15856.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018436448648571968, 'sampling/sampling_logp_difference/max': 10.743270874023438, 'sampling/importance_sampling_ratio/min': 2.1590203687082976e-05, 'sampling/importance_sampling_ratio/mean': 0.9999277591705322, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7868659943342209, 'clip_ratio/low_mean': 4.201903630018933e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.383796982030617e-06, 'clip_ratio/high_max': 9.535187928122468e-06, 'clip_ratio/region_mean': 4.440283305484627e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 662/1024 [30:21:38<16:23:04, 162.94s/it][AINFO 12-02 19:46:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:46:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:46:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:46:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 663/1024 [30:24:36<16:46:14, 167.24s/it][A
+                                                         [A{'loss': 0.1195, 'grad_norm': 0.0021831525955349207, 'learning_rate': 1e-05, 'num_tokens': 584971568.0, 'completions/mean_length': 7106.296875, 'completions/min_length': 802.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6487.78369140625, 'completions/min_terminated_length': 802.0, 'completions/max_terminated_length': 16291.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.32772916555404663, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018681492656469345, 'sampling/sampling_logp_difference/max': 9.413989067077637, 'sampling/importance_sampling_ratio/min': 8.157488628057763e-05, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8079892098903656, 'clip_ratio/low_mean': 5.7681085309013724e-05, 'clip_ratio/low_min': 4.5418209992931224e-06, 'clip_ratio/high_mean': 9.566726021148497e-06, 'clip_ratio/high_max': 3.5268151805212256e-05, 'clip_ratio/region_mean': 6.724781314915163e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 663/1024 [30:24:36<16:46:14, 167.24s/it][AINFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 664/1024 [30:27:20<16:37:47, 166.30s/it][A
+                                                         [A{'loss': 0.0642, 'grad_norm': 0.004661369137465954, 'learning_rate': 1e-05, 'num_tokens': 585916134.0, 'completions/mean_length': 7235.046875, 'completions/min_length': 1472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7089.82568359375, 'completions/min_terminated_length': 1472.0, 'completions/max_terminated_length': 16363.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.322716623544693, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018921509385108948, 'sampling/sampling_logp_difference/max': 7.249154567718506, 'sampling/importance_sampling_ratio/min': 0.0007107750861905515, 'sampling/importance_sampling_ratio/mean': 1.0000330209732056, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8041050210595131, 'clip_ratio/low_mean': 3.626145735324826e-05, 'clip_ratio/low_min': 3.933786501875147e-06, 'clip_ratio/high_mean': 1.1574332802410936e-05, 'clip_ratio/high_max': 4.332071557655581e-05, 'clip_ratio/region_mean': 4.783579004197236e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 664/1024 [30:27:20<16:37:47, 166.30s/it][AINFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 665/1024 [30:30:06<16:35:59, 166.46s/it][A
+                                                         [A{'loss': 0.0538, 'grad_norm': 0.0024479639250785112, 'learning_rate': 1e-05, 'num_tokens': 586841633.0, 'completions/mean_length': 7077.5859375, 'completions/min_length': 944.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6777.37890625, 'completions/min_terminated_length': 944.0, 'completions/max_terminated_length': 13888.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019227145239710808, 'sampling/sampling_logp_difference/max': 8.160323143005371, 'sampling/importance_sampling_ratio/min': 0.00028577001648955047, 'sampling/importance_sampling_ratio/mean': 0.9998941421508789, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8417644873261452, 'clip_ratio/low_mean': 2.6745638365355262e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.737838710549113e-06, 'clip_ratio/high_max': 1.4951354842196452e-05, 'clip_ratio/region_mean': 3.0483477416964888e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 665/1024 [30:30:06<16:35:59, 166.46s/it][AINFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▌   | 666/1024 [30:33:15<17:11:59, 172.96s/it][A
+                                                         [A{'loss': 0.0352, 'grad_norm': 0.005297356750816107, 'learning_rate': 1e-05, 'num_tokens': 587897122.0, 'completions/mean_length': 8090.3203125, 'completions/min_length': 768.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7463.0673828125, 'completions/min_terminated_length': 768.0, 'completions/max_terminated_length': 15900.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.27851754426956177, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018079372122883797, 'sampling/sampling_logp_difference/max': 7.353616237640381, 'sampling/importance_sampling_ratio/min': 0.0006402728031389415, 'sampling/importance_sampling_ratio/mean': 0.9999694228172302, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7603196427226067, 'clip_ratio/low_mean': 4.123253006582672e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.19675950272358e-06, 'clip_ratio/high_max': 1.7368187855026918e-05, 'clip_ratio/region_mean': 4.642928979592398e-05, 'epoch': 0.61}
+
+ 65%|██████▌   | 666/1024 [30:33:15<17:11:59, 172.96s/it][AINFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▌   | 667/1024 [30:35:55<16:46:21, 169.14s/it][A
+                                                         [A{'loss': 0.0695, 'grad_norm': 0.003049109596759081, 'learning_rate': 1e-05, 'num_tokens': 588801206.0, 'completions/mean_length': 6908.96875, 'completions/min_length': 406.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6360.826171875, 'completions/min_terminated_length': 406.0, 'completions/max_terminated_length': 15514.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018563130870461464, 'sampling/sampling_logp_difference/max': 5.573733329772949, 'sampling/importance_sampling_ratio/min': 0.0037962812930345535, 'sampling/importance_sampling_ratio/mean': 0.9999892711639404, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7355617135763168, 'clip_ratio/low_mean': 2.9263440183058265e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.941788918382372e-06, 'clip_ratio/high_max': 1.5767155673529487e-05, 'clip_ratio/region_mean': 3.3205229101440636e-05, 'epoch': 0.61}
+
+ 65%|██████▌   | 667/1024 [30:35:55<16:46:21, 169.14s/it][AINFO 12-02 20:00:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:00:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:00:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:00:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▌   | 668/1024 [30:38:49<16:52:25, 170.63s/it][A
+                                                         [A{'loss': 0.1521, 'grad_norm': 0.0034495368599891663, 'learning_rate': 1e-05, 'num_tokens': 589732588.0, 'completions/mean_length': 7110.109375, 'completions/min_length': 1008.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6810.951171875, 'completions/min_terminated_length': 1008.0, 'completions/max_terminated_length': 16333.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.326668381690979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016679491847753525, 'sampling/sampling_logp_difference/max': 7.4639434814453125, 'sampling/importance_sampling_ratio/min': 0.000573390512727201, 'sampling/importance_sampling_ratio/mean': 0.9999086856842041, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.688617967069149, 'clip_ratio/low_mean': 6.839358093202463e-05, 'clip_ratio/low_min': 9.10438984647044e-06, 'clip_ratio/high_mean': 4.312999067224155e-06, 'clip_ratio/high_max': 1.725199626889662e-05, 'clip_ratio/region_mean': 7.27065794308146e-05, 'epoch': 0.61}
+
+ 65%|██████▌   | 668/1024 [30:38:49<16:52:25, 170.63s/it][AINFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▌   | 669/1024 [30:41:44<16:56:45, 171.85s/it][A
+                                                         [A{'loss': 0.0258, 'grad_norm': 0.004971730522811413, 'learning_rate': 1e-05, 'num_tokens': 590717118.0, 'completions/mean_length': 7533.578125, 'completions/min_length': 1321.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7021.56982421875, 'completions/min_terminated_length': 1321.0, 'completions/max_terminated_length': 16263.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.30904704332351685, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01741175726056099, 'sampling/sampling_logp_difference/max': 9.96833324432373, 'sampling/importance_sampling_ratio/min': 4.6860604925313964e-05, 'sampling/importance_sampling_ratio/mean': 0.9998904466629028, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7306379675865173, 'clip_ratio/low_mean': 5.138145911587344e-05, 'clip_ratio/low_min': 3.9801311686460394e-06, 'clip_ratio/high_mean': 2.31802277994575e-06, 'clip_ratio/high_max': 5.049688752478687e-06, 'clip_ratio/region_mean': 5.369948189581919e-05, 'epoch': 0.62}
+
+ 65%|██████▌   | 669/1024 [30:41:44<16:56:45, 171.85s/it][AINFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▌   | 670/1024 [30:44:18<16:23:01, 166.61s/it][A
+                                                         [A{'loss': 0.0893, 'grad_norm': 0.003072877414524555, 'learning_rate': 1e-05, 'num_tokens': 591524494.0, 'completions/mean_length': 6165.0, 'completions/min_length': 1088.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6002.7939453125, 'completions/min_terminated_length': 1088.0, 'completions/max_terminated_length': 15983.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.28353992104530334, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016893092542886734, 'sampling/sampling_logp_difference/max': 8.5533447265625, 'sampling/importance_sampling_ratio/min': 0.00019289882038719952, 'sampling/importance_sampling_ratio/mean': 1.0000028610229492, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7227498516440392, 'clip_ratio/low_mean': 4.160707453593204e-05, 'clip_ratio/low_min': 7.402582014037762e-06, 'clip_ratio/high_mean': 3.4612473314155068e-06, 'clip_ratio/high_max': 1.3844989325662027e-05, 'clip_ratio/region_mean': 4.506832192419097e-05, 'epoch': 0.62}
+
+ 65%|██████▌   | 670/1024 [30:44:18<16:23:01, 166.61s/it][AINFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 671/1024 [30:46:54<16:01:07, 163.36s/it][A
+                                                         [A{'loss': 0.0619, 'grad_norm': 0.003992745652794838, 'learning_rate': 1e-05, 'num_tokens': 592320726.0, 'completions/mean_length': 6061.9375, 'completions/min_length': 973.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5728.9677734375, 'completions/min_terminated_length': 973.0, 'completions/max_terminated_length': 15451.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018718186765909195, 'sampling/sampling_logp_difference/max': 9.499366760253906, 'sampling/importance_sampling_ratio/min': 7.489924610126764e-05, 'sampling/importance_sampling_ratio/mean': 0.9999755620956421, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.813653938472271, 'clip_ratio/low_mean': 3.8767432329223084e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.71779502631398e-06, 'clip_ratio/high_max': 3.056439982174197e-05, 'clip_ratio/region_mean': 4.7485227241850225e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 671/1024 [30:46:54<16:01:07, 163.36s/it][AINFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 672/1024 [30:50:04<16:46:26, 171.55s/it][A
+                                                         [A{'loss': 0.1016, 'grad_norm': 0.003727070288732648, 'learning_rate': 1e-05, 'num_tokens': 593270695.0, 'completions/mean_length': 7265.9453125, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6658.0751953125, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 16210.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.30327796936035156, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017805757001042366, 'sampling/sampling_logp_difference/max': 13.115309715270996, 'sampling/importance_sampling_ratio/min': 2.014157189478283e-06, 'sampling/importance_sampling_ratio/mean': 0.999910831451416, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7658502459526062, 'clip_ratio/low_mean': 3.851054543702048e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.138349368076888e-06, 'clip_ratio/high_max': 1.655339747230755e-05, 'clip_ratio/region_mean': 4.264889435035002e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 672/1024 [30:50:04<16:46:26, 171.55s/it][AINFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 673/1024 [30:53:14<17:15:09, 176.95s/it][A
+                                                         [A{'loss': 0.0535, 'grad_norm': 0.0038730741944164038, 'learning_rate': 1e-05, 'num_tokens': 594386261.0, 'completions/mean_length': 8564.046875, 'completions/min_length': 968.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7680.0517578125, 'completions/min_terminated_length': 968.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.32483339309692383, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01785116083920002, 'sampling/sampling_logp_difference/max': 8.660311698913574, 'sampling/importance_sampling_ratio/min': 0.00017333027790300548, 'sampling/importance_sampling_ratio/mean': 0.9999313354492188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6856872886419296, 'clip_ratio/low_mean': 5.263989112336276e-05, 'clip_ratio/low_min': 1.2888257515442092e-05, 'clip_ratio/high_mean': 6.335726652650919e-06, 'clip_ratio/high_max': 2.0501698145380942e-05, 'clip_ratio/region_mean': 5.897561732126633e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 673/1024 [30:53:14<17:15:09, 176.95s/it][AINFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 674/1024 [30:56:08<17:07:17, 176.11s/it][A
+                                                         [A{'loss': 0.0565, 'grad_norm': 0.004014961421489716, 'learning_rate': 1e-05, 'num_tokens': 595407313.0, 'completions/mean_length': 7838.28125, 'completions/min_length': 872.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7343.900390625, 'completions/min_terminated_length': 872.0, 'completions/max_terminated_length': 16349.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3148210048675537, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01619477942585945, 'sampling/sampling_logp_difference/max': 13.904884338378906, 'sampling/importance_sampling_ratio/min': 9.145037438429426e-07, 'sampling/importance_sampling_ratio/mean': 0.999966025352478, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.636501632630825, 'clip_ratio/low_mean': 4.970566510564822e-05, 'clip_ratio/low_min': 4.473552507988643e-06, 'clip_ratio/high_mean': 8.523603241883393e-06, 'clip_ratio/high_max': 2.6982705094269477e-05, 'clip_ratio/region_mean': 5.82292680064711e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 674/1024 [30:56:08<17:07:17, 176.11s/it][AINFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 675/1024 [30:59:05<17:05:21, 176.28s/it][A
+                                                         [A{'loss': 0.0855, 'grad_norm': 0.004226911347359419, 'learning_rate': 1e-05, 'num_tokens': 596291470.0, 'completions/mean_length': 6784.5390625, 'completions/min_length': 906.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6229.1982421875, 'completions/min_terminated_length': 906.0, 'completions/max_terminated_length': 16323.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2409384697675705, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.015974994748830795, 'sampling/sampling_logp_difference/max': 8.49952507019043, 'sampling/importance_sampling_ratio/min': 0.00020356501045171171, 'sampling/importance_sampling_ratio/mean': 0.9999697208404541, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6435417085886002, 'clip_ratio/low_mean': 2.8467071842896985e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4123655773801147e-06, 'clip_ratio/high_max': 5.649462309520459e-06, 'clip_ratio/region_mean': 2.98794374202771e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 675/1024 [30:59:05<17:05:21, 176.28s/it][AINFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 676/1024 [31:02:07<17:13:25, 178.18s/it][A
+                                                         [A{'loss': 0.0675, 'grad_norm': 0.0031262668780982494, 'learning_rate': 1e-05, 'num_tokens': 597291107.0, 'completions/mean_length': 7650.2265625, 'completions/min_length': 1063.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6989.689453125, 'completions/min_terminated_length': 1063.0, 'completions/max_terminated_length': 16122.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2012200653553009, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01801086962223053, 'sampling/sampling_logp_difference/max': 14.7490234375, 'sampling/importance_sampling_ratio/min': 3.9317012578976573e-07, 'sampling/importance_sampling_ratio/mean': 0.9998708963394165, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7500722259283066, 'clip_ratio/low_mean': 2.2315146964047017e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.2315146964047017e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 676/1024 [31:02:07<17:13:25, 178.18s/it][AINFO 12-02 20:27:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 677/1024 [31:05:01<17:03:20, 176.95s/it][A
+                                                         [A{'loss': 0.0759, 'grad_norm': 0.006345350295305252, 'learning_rate': 1e-05, 'num_tokens': 598129568.0, 'completions/mean_length': 6377.4140625, 'completions/min_length': 478.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6298.6220703125, 'completions/min_terminated_length': 478.0, 'completions/max_terminated_length': 15718.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.31929677724838257, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01888679713010788, 'sampling/sampling_logp_difference/max': 9.620697021484375, 'sampling/importance_sampling_ratio/min': 6.634136661887169e-05, 'sampling/importance_sampling_ratio/mean': 1.0000131130218506, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8221950903534889, 'clip_ratio/low_mean': 5.510050823431811e-05, 'clip_ratio/low_min': 4.993807579012355e-06, 'clip_ratio/high_mean': 5.693989294286439e-06, 'clip_ratio/high_max': 2.2775957177145756e-05, 'clip_ratio/region_mean': 6.079449713070062e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 677/1024 [31:05:01<17:03:20, 176.95s/it][AINFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 678/1024 [31:07:45<16:36:23, 172.78s/it][A
+                                                         [A{'loss': 0.0811, 'grad_norm': 0.003562809666618705, 'learning_rate': 1e-05, 'num_tokens': 598862361.0, 'completions/mean_length': 5567.2578125, 'completions/min_length': 927.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5218.33056640625, 'completions/min_terminated_length': 927.0, 'completions/max_terminated_length': 14250.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016367387026548386, 'sampling/sampling_logp_difference/max': 15.2844877243042, 'sampling/importance_sampling_ratio/min': 2.3016077932425105e-07, 'sampling/importance_sampling_ratio/mean': 0.9999499320983887, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7284790053963661, 'clip_ratio/low_mean': 5.5144641464721644e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.139227250263502e-06, 'clip_ratio/high_max': 2.3920926196296932e-05, 'clip_ratio/region_mean': 6.228386882867198e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 678/1024 [31:07:45<16:36:23, 172.78s/it][AINFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▋   | 679/1024 [31:10:39<16:35:49, 173.19s/it][A
+                                                         [A{'loss': 0.0517, 'grad_norm': 0.0031477995216846466, 'learning_rate': 1e-05, 'num_tokens': 599921233.0, 'completions/mean_length': 8128.375, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7578.00048828125, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15202.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019213391467928886, 'sampling/sampling_logp_difference/max': 8.569145202636719, 'sampling/importance_sampling_ratio/min': 0.00018987487419508398, 'sampling/importance_sampling_ratio/mean': 0.9999460577964783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7838430106639862, 'clip_ratio/low_mean': 2.498499657122011e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.364888013697055e-06, 'clip_ratio/high_max': 1.4490571629721671e-05, 'clip_ratio/region_mean': 2.934988481229084e-05, 'epoch': 0.62}
+
+ 66%|██████▋   | 679/1024 [31:10:39<16:35:49, 173.19s/it][AINFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▋   | 680/1024 [31:13:29<16:27:32, 172.25s/it][A
+                                                         [A{'loss': 0.0896, 'grad_norm': 0.0034168637357652187, 'learning_rate': 1e-05, 'num_tokens': 600895023.0, 'completions/mean_length': 7452.296875, 'completions/min_length': 799.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7013.0322265625, 'completions/min_terminated_length': 799.0, 'completions/max_terminated_length': 15879.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3061561584472656, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019955754280090332, 'sampling/sampling_logp_difference/max': 7.843585014343262, 'sampling/importance_sampling_ratio/min': 0.0003922602627426386, 'sampling/importance_sampling_ratio/mean': 0.9999901056289673, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8657966181635857, 'clip_ratio/low_mean': 3.322141196804296e-05, 'clip_ratio/low_min': 2.5509161787340418e-06, 'clip_ratio/high_mean': 8.023214263630507e-06, 'clip_ratio/high_max': 2.650051692398847e-05, 'clip_ratio/region_mean': 4.124462532217876e-05, 'epoch': 0.63}
+
+ 66%|██████▋   | 680/1024 [31:13:29<16:27:32, 172.25s/it][AINFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 681/1024 [31:16:16<16:16:19, 170.78s/it][A
+                                                         [A{'loss': 0.105, 'grad_norm': 0.0026859277859330177, 'learning_rate': 1e-05, 'num_tokens': 601887935.0, 'completions/mean_length': 7581.625, 'completions/min_length': 1686.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7072.396484375, 'completions/min_terminated_length': 1686.0, 'completions/max_terminated_length': 15759.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3295465111732483, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018653862178325653, 'sampling/sampling_logp_difference/max': 3.5232622623443604, 'sampling/importance_sampling_ratio/min': 0.029503032565116882, 'sampling/importance_sampling_ratio/mean': 0.9999804496765137, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.764233261346817, 'clip_ratio/low_mean': 5.516502255886735e-05, 'clip_ratio/low_min': 5.772084023192292e-06, 'clip_ratio/high_mean': 2.0586570599334664e-06, 'clip_ratio/high_max': 8.234628239733865e-06, 'clip_ratio/region_mean': 5.7223681096729706e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 681/1024 [31:16:16<16:16:19, 170.78s/it][AINFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 682/1024 [31:19:16<16:29:32, 173.60s/it][A
+                                                         [A{'loss': 0.0491, 'grad_norm': 0.002624326851218939, 'learning_rate': 1e-05, 'num_tokens': 603035462.0, 'completions/mean_length': 8824.2421875, 'completions/min_length': 1991.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8452.4501953125, 'completions/min_terminated_length': 1991.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01799936406314373, 'sampling/sampling_logp_difference/max': 9.874824523925781, 'sampling/importance_sampling_ratio/min': 5.1453887863317505e-05, 'sampling/importance_sampling_ratio/mean': 0.9999333024024963, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7557987719774246, 'clip_ratio/low_mean': 5.129833289174712e-05, 'clip_ratio/low_min': 5.234505806583911e-06, 'clip_ratio/high_mean': 6.635149020439712e-06, 'clip_ratio/high_max': 2.654059608175885e-05, 'clip_ratio/region_mean': 5.793348100269213e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 682/1024 [31:19:16<16:29:32, 173.60s/it][AINFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 683/1024 [31:22:02<16:13:28, 171.29s/it][A
+                                                         [A{'loss': 0.0796, 'grad_norm': 0.005783884786069393, 'learning_rate': 1e-05, 'num_tokens': 603801083.0, 'completions/mean_length': 5832.7890625, 'completions/min_length': 948.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5749.70849609375, 'completions/min_terminated_length': 948.0, 'completions/max_terminated_length': 16189.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01805710420012474, 'sampling/sampling_logp_difference/max': 3.399966239929199, 'sampling/importance_sampling_ratio/min': 0.033374395221471786, 'sampling/importance_sampling_ratio/mean': 1.0000687837600708, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8034545630216599, 'clip_ratio/low_mean': 3.1395032920045196e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.1395032920045196e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 683/1024 [31:22:02<16:13:28, 171.29s/it][AINFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 684/1024 [31:24:54<16:12:00, 171.53s/it][A
+                                                         [A{'loss': 0.0832, 'grad_norm': 0.005038067698478699, 'learning_rate': 1e-05, 'num_tokens': 604748150.0, 'completions/mean_length': 7247.4609375, 'completions/min_length': 416.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7102.43701171875, 'completions/min_terminated_length': 416.0, 'completions/max_terminated_length': 16221.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.43106767535209656, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01983889564871788, 'sampling/sampling_logp_difference/max': 5.781791687011719, 'sampling/importance_sampling_ratio/min': 0.0030831864569336176, 'sampling/importance_sampling_ratio/mean': 0.9999319314956665, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.908146396279335, 'clip_ratio/low_mean': 5.521000275621191e-05, 'clip_ratio/low_min': 9.064021924132248e-06, 'clip_ratio/high_mean': 6.736250270478195e-06, 'clip_ratio/high_max': 2.2193052700458793e-05, 'clip_ratio/region_mean': 6.19462530266901e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 684/1024 [31:24:54<16:12:00, 171.53s/it][AINFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 685/1024 [31:27:35<15:50:10, 168.17s/it][A
+                                                         [A{'loss': 0.0967, 'grad_norm': 0.0053928992711007595, 'learning_rate': 1e-05, 'num_tokens': 605642768.0, 'completions/mean_length': 6861.078125, 'completions/min_length': 530.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6473.96728515625, 'completions/min_terminated_length': 530.0, 'completions/max_terminated_length': 15519.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.40503159165382385, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018197370693087578, 'sampling/sampling_logp_difference/max': 9.99008560180664, 'sampling/importance_sampling_ratio/min': 4.585228089126758e-05, 'sampling/importance_sampling_ratio/mean': 0.9999208450317383, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7612876370549202, 'clip_ratio/low_mean': 6.599987852951017e-05, 'clip_ratio/low_min': 1.7551100199852954e-05, 'clip_ratio/high_mean': 2.157538972369366e-06, 'clip_ratio/high_max': 8.630155889477464e-06, 'clip_ratio/region_mean': 6.815741778609663e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 685/1024 [31:27:35<15:50:10, 168.17s/it][AINFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 686/1024 [31:30:24<15:50:11, 168.67s/it][A
+                                                         [A{'loss': 0.1155, 'grad_norm': 0.0032709913793951273, 'learning_rate': 1e-05, 'num_tokens': 606534577.0, 'completions/mean_length': 6837.8203125, 'completions/min_length': 558.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6201.40869140625, 'completions/min_terminated_length': 558.0, 'completions/max_terminated_length': 16310.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2567248046398163, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016331009566783905, 'sampling/sampling_logp_difference/max': 10.062474250793457, 'sampling/importance_sampling_ratio/min': 4.2650382965803146e-05, 'sampling/importance_sampling_ratio/mean': 0.9999561309814453, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6217481270432472, 'clip_ratio/low_mean': 5.132838714416721e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.32969795333338e-06, 'clip_ratio/high_max': 2.531879181333352e-05, 'clip_ratio/region_mean': 5.765808464275324e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 686/1024 [31:30:24<15:50:11, 168.67s/it][AINFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 687/1024 [31:33:08<15:38:48, 167.15s/it][A
+                                                         [A{'loss': 0.0581, 'grad_norm': 0.002640153281390667, 'learning_rate': 1e-05, 'num_tokens': 607381811.0, 'completions/mean_length': 6458.703125, 'completions/min_length': 454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6380.55126953125, 'completions/min_terminated_length': 454.0, 'completions/max_terminated_length': 15103.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2382800281047821, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017693117260932922, 'sampling/sampling_logp_difference/max': 13.93669319152832, 'sampling/importance_sampling_ratio/min': 8.858721116666857e-07, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7460968196392059, 'clip_ratio/low_mean': 5.021198876420385e-05, 'clip_ratio/low_min': 4.219409220240777e-06, 'clip_ratio/high_mean': 3.581897317417315e-06, 'clip_ratio/high_max': 1.0992388070008019e-05, 'clip_ratio/region_mean': 5.379388539950014e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 687/1024 [31:33:08<15:38:48, 167.15s/it][AINFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 688/1024 [31:35:53<15:32:14, 166.47s/it][A
+                                                         [A{'loss': 0.0635, 'grad_norm': 0.003713687416166067, 'learning_rate': 1e-05, 'num_tokens': 608302256.0, 'completions/mean_length': 7043.1640625, 'completions/min_length': 952.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6894.8974609375, 'completions/min_terminated_length': 952.0, 'completions/max_terminated_length': 16121.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2648528814315796, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019254781305789948, 'sampling/sampling_logp_difference/max': 16.12498664855957, 'sampling/importance_sampling_ratio/min': 9.931326871992496e-08, 'sampling/importance_sampling_ratio/mean': 0.9999112486839294, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7884078621864319, 'clip_ratio/low_mean': 6.0473582834674744e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1366002417598793e-06, 'clip_ratio/high_max': 1.2546400967039517e-05, 'clip_ratio/region_mean': 6.361018404277274e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 688/1024 [31:35:53<15:32:14, 166.47s/it][AINFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 689/1024 [31:38:48<15:44:30, 169.17s/it][A
+                                                         [A{'loss': 0.0275, 'grad_norm': 0.004894682671874762, 'learning_rate': 1e-05, 'num_tokens': 609348299.0, 'completions/mean_length': 8012.8359375, 'completions/min_length': 866.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7742.79833984375, 'completions/min_terminated_length': 866.0, 'completions/max_terminated_length': 15487.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3027411997318268, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01958826184272766, 'sampling/sampling_logp_difference/max': 6.1424455642700195, 'sampling/importance_sampling_ratio/min': 0.0021496599074453115, 'sampling/importance_sampling_ratio/mean': 1.0000343322753906, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8368816301226616, 'clip_ratio/low_mean': 4.4303845015747356e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.971898143438011e-06, 'clip_ratio/high_max': 7.887592573752045e-06, 'clip_ratio/region_mean': 4.6275743216028786e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 689/1024 [31:38:48<15:44:30, 169.17s/it][AINFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 690/1024 [31:41:48<15:58:40, 172.22s/it][A
+                                                         [A{'loss': 0.0574, 'grad_norm': 0.0039004215504974127, 'learning_rate': 1e-05, 'num_tokens': 610341090.0, 'completions/mean_length': 7594.3671875, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7008.39208984375, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 16065.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3284856975078583, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01734849065542221, 'sampling/sampling_logp_difference/max': 10.124999046325684, 'sampling/importance_sampling_ratio/min': 4.006533345091157e-05, 'sampling/importance_sampling_ratio/mean': 0.9999041557312012, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.692665733397007, 'clip_ratio/low_mean': 3.859445814669016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6727028625828098e-06, 'clip_ratio/high_max': 1.0690811450331239e-05, 'clip_ratio/region_mean': 4.1267160668212455e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 690/1024 [31:41:48<15:58:40, 172.22s/it][AINFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 691/1024 [31:44:47<16:06:46, 174.19s/it][A
+                                                         [A{'loss': 0.0782, 'grad_norm': 0.004913663491606712, 'learning_rate': 1e-05, 'num_tokens': 611339726.0, 'completions/mean_length': 7640.09375, 'completions/min_length': 826.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7358.0322265625, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019604282453656197, 'sampling/sampling_logp_difference/max': 16.983896255493164, 'sampling/importance_sampling_ratio/min': 4.2071459205317296e-08, 'sampling/importance_sampling_ratio/mean': 0.9998912811279297, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8469130471348763, 'clip_ratio/low_mean': 5.9335616697353544e-05, 'clip_ratio/low_min': 5.472375505632954e-06, 'clip_ratio/high_mean': 2.7999831218039617e-06, 'clip_ratio/high_max': 4.406994776218198e-06, 'clip_ratio/region_mean': 6.21355998191575e-05, 'epoch': 0.64}
+
+ 67%|██████▋   | 691/1024 [31:44:47<16:06:46, 174.19s/it][AINFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 692/1024 [31:47:17<15:24:17, 167.04s/it][A
+                                                         [A{'loss': 0.0371, 'grad_norm': 0.0032354791183024645, 'learning_rate': 1e-05, 'num_tokens': 612005495.0, 'completions/mean_length': 5063.6953125, 'completions/min_length': 319.0, 'completions/max_length': 15895.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5063.6953125, 'completions/min_terminated_length': 319.0, 'completions/max_terminated_length': 15895.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0178166925907135, 'sampling/sampling_logp_difference/max': 3.8934366703033447, 'sampling/importance_sampling_ratio/min': 0.02037520334124565, 'sampling/importance_sampling_ratio/mean': 0.9999009370803833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7586102113127708, 'clip_ratio/low_mean': 2.7830240469484124e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.209913979342673e-06, 'clip_ratio/high_max': 1.4971937162044924e-05, 'clip_ratio/region_mean': 3.304015490357415e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 692/1024 [31:47:17<15:24:17, 167.04s/it][AINFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 693/1024 [31:49:53<15:04:06, 163.89s/it][A
+                                                         [A{'loss': 0.0357, 'grad_norm': 0.004039868246763945, 'learning_rate': 1e-05, 'num_tokens': 612870060.0, 'completions/mean_length': 6542.1640625, 'completions/min_length': 665.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6385.94482421875, 'completions/min_terminated_length': 665.0, 'completions/max_terminated_length': 14868.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019780561327934265, 'sampling/sampling_logp_difference/max': 24.499982833862305, 'sampling/importance_sampling_ratio/min': 2.2897740994953786e-11, 'sampling/importance_sampling_ratio/mean': 0.9998836517333984, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.847448967397213, 'clip_ratio/low_mean': 1.1576638144106255e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.3344492698670365e-06, 'clip_ratio/high_max': 2.1337797079468146e-05, 'clip_ratio/region_mean': 1.691108741397329e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 693/1024 [31:49:53<15:04:06, 163.89s/it][AINFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 694/1024 [31:52:36<14:59:16, 163.50s/it][A
+                                                         [A{'loss': 0.0632, 'grad_norm': 0.0023007066920399666, 'learning_rate': 1e-05, 'num_tokens': 613633581.0, 'completions/mean_length': 5805.8203125, 'completions/min_length': 919.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5551.9443359375, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 16287.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.23857943713665009, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016993921250104904, 'sampling/sampling_logp_difference/max': 8.249631881713867, 'sampling/importance_sampling_ratio/min': 0.00026135475491173565, 'sampling/importance_sampling_ratio/mean': 1.0000262260437012, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6972410827875137, 'clip_ratio/low_mean': 3.4833526569855167e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5834565374461818e-06, 'clip_ratio/high_max': 6.333826149784727e-06, 'clip_ratio/region_mean': 3.641698299361451e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 694/1024 [31:52:36<14:59:16, 163.50s/it][AINFO 12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 695/1024 [31:55:51<15:47:57, 172.88s/it][A
+                                                         [A{'loss': 0.0413, 'grad_norm': 0.0029130352195352316, 'learning_rate': 1e-05, 'num_tokens': 614611881.0, 'completions/mean_length': 7504.65625, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 6586.103515625, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 16249.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.23250606656074524, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018666472285985947, 'sampling/sampling_logp_difference/max': 13.109896659851074, 'sampling/importance_sampling_ratio/min': 2.025089543167269e-06, 'sampling/importance_sampling_ratio/mean': 0.999863862991333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7908455803990364, 'clip_ratio/low_mean': 1.501361566624837e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6609881754447997e-06, 'clip_ratio/high_max': 6.643952701779199e-06, 'clip_ratio/region_mean': 1.667460389853659e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 695/1024 [31:55:51<15:47:57, 172.88s/it][AINFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 696/1024 [31:57:56<14:27:16, 158.65s/it][A
+                                                         [A{'loss': 0.1054, 'grad_norm': 0.0020515238866209984, 'learning_rate': 1e-05, 'num_tokens': 615355915.0, 'completions/mean_length': 5627.265625, 'completions/min_length': 233.0, 'completions/max_length': 13984.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5627.265625, 'completions/min_terminated_length': 233.0, 'completions/max_terminated_length': 13984.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.26827272772789, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01694992370903492, 'sampling/sampling_logp_difference/max': 5.874999046325684, 'sampling/importance_sampling_ratio/min': 0.002808797173202038, 'sampling/importance_sampling_ratio/mean': 0.9999716877937317, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7167766839265823, 'clip_ratio/low_mean': 5.670640712196473e-05, 'clip_ratio/low_min': 6.148246484372066e-06, 'clip_ratio/high_mean': 4.543699901660148e-06, 'clip_ratio/high_max': 1.817479960664059e-05, 'clip_ratio/region_mean': 6.125010668256436e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 696/1024 [31:57:56<14:27:16, 158.65s/it][AINFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 697/1024 [32:00:46<14:43:01, 162.02s/it][A
+                                                         [A{'loss': 0.0428, 'grad_norm': 0.003425017697736621, 'learning_rate': 1e-05, 'num_tokens': 616159416.0, 'completions/mean_length': 6129.9140625, 'completions/min_length': 1201.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5967.1513671875, 'completions/min_terminated_length': 1201.0, 'completions/max_terminated_length': 14713.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2188364714384079, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01828661933541298, 'sampling/sampling_logp_difference/max': 5.187221050262451, 'sampling/importance_sampling_ratio/min': 0.005587513092905283, 'sampling/importance_sampling_ratio/mean': 0.9999443292617798, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7654511705040932, 'clip_ratio/low_mean': 5.3280599786376115e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.3280599786376115e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 697/1024 [32:00:46<14:43:01, 162.02s/it][AINFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 698/1024 [32:03:31<14:45:22, 162.95s/it][A
+                                                         [A{'loss': 0.0607, 'grad_norm': 0.005707201547920704, 'learning_rate': 1e-05, 'num_tokens': 617101738.0, 'completions/mean_length': 7219.078125, 'completions/min_length': 649.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7146.91357421875, 'completions/min_terminated_length': 649.0, 'completions/max_terminated_length': 16340.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01983051374554634, 'sampling/sampling_logp_difference/max': 12.874998092651367, 'sampling/importance_sampling_ratio/min': 2.5612937406549463e-06, 'sampling/importance_sampling_ratio/mean': 0.9999914765357971, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.847568191587925, 'clip_ratio/low_mean': 3.4785461366482195e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.459671456264914e-06, 'clip_ratio/high_max': 2.1838685825059656e-05, 'clip_ratio/region_mean': 4.024513225431292e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 698/1024 [32:03:31<14:45:22, 162.95s/it][AINFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 699/1024 [32:06:05<14:27:07, 160.09s/it][A
+                                                         [A{'loss': 0.078, 'grad_norm': 0.004018646199256182, 'learning_rate': 1e-05, 'num_tokens': 617903030.0, 'completions/mean_length': 6116.96875, 'completions/min_length': 1371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5870.56005859375, 'completions/min_terminated_length': 1371.0, 'completions/max_terminated_length': 14972.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2569621503353119, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017638593912124634, 'sampling/sampling_logp_difference/max': 8.749999046325684, 'sampling/importance_sampling_ratio/min': 0.00015846146561671048, 'sampling/importance_sampling_ratio/mean': 0.9999732971191406, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7148991823196411, 'clip_ratio/low_mean': 5.492671812135086e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.55213056638604e-06, 'clip_ratio/high_max': 2.676450185390422e-05, 'clip_ratio/region_mean': 6.347884914248425e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 699/1024 [32:06:05<14:27:07, 160.09s/it][AINFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 700/1024 [32:08:57<14:44:19, 163.76s/it][A
+                                                         [A{'loss': 0.0617, 'grad_norm': 0.00282766274176538, 'learning_rate': 1e-05, 'num_tokens': 618880312.0, 'completions/mean_length': 7486.515625, 'completions/min_length': 611.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7272.9765625, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 15232.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32089442014694214, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01940794661641121, 'sampling/sampling_logp_difference/max': 9.180124282836914, 'sampling/importance_sampling_ratio/min': 0.0001030677231028676, 'sampling/importance_sampling_ratio/mean': 0.9999787211418152, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7912377193570137, 'clip_ratio/low_mean': 7.103690825260855e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9004990008397726e-06, 'clip_ratio/high_max': 3.844970706268214e-06, 'clip_ratio/region_mean': 7.29374083903167e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 700/1024 [32:08:57<14:44:19, 163.76s/it][AINFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 701/1024 [32:12:07<15:24:30, 171.73s/it][A
+                                                         [A{'loss': 0.0438, 'grad_norm': 0.0016839519375935197, 'learning_rate': 1e-05, 'num_tokens': 619834002.0, 'completions/mean_length': 7297.453125, 'completions/min_length': 743.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6610.23583984375, 'completions/min_terminated_length': 743.0, 'completions/max_terminated_length': 13644.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019356656819581985, 'sampling/sampling_logp_difference/max': 7.59285831451416, 'sampling/importance_sampling_ratio/min': 0.0005040382966399193, 'sampling/importance_sampling_ratio/mean': 0.9999658465385437, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8420139253139496, 'clip_ratio/low_mean': 3.103233757428825e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.616161613237637e-06, 'clip_ratio/high_max': 2.241842275907402e-05, 'clip_ratio/region_mean': 3.76484995285864e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 701/1024 [32:12:07<15:24:30, 171.73s/it][AINFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 21:38:49,411 - math_verify.grader - WARNING - Timeout during comparison
+
+ 69%|██████▊   | 702/1024 [32:14:48<15:03:21, 168.33s/it][A
+                                                         [A{'loss': 0.0822, 'grad_norm': 0.00550073804333806, 'learning_rate': 1e-05, 'num_tokens': 620615054.0, 'completions/mean_length': 5935.53125, 'completions/min_length': 632.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5684.76806640625, 'completions/min_terminated_length': 632.0, 'completions/max_terminated_length': 15471.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3366856575012207, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01712688058614731, 'sampling/sampling_logp_difference/max': 10.624999046325684, 'sampling/importance_sampling_ratio/min': 2.4300854420289397e-05, 'sampling/importance_sampling_ratio/mean': 1.0000221729278564, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6855737417936325, 'clip_ratio/low_mean': 4.7923438614816405e-05, 'clip_ratio/low_min': 3.219243353669299e-06, 'clip_ratio/high_mean': 2.447962742735399e-06, 'clip_ratio/high_max': 9.791850970941596e-06, 'clip_ratio/region_mean': 5.0371401357551804e-05, 'epoch': 0.65}
+
+ 69%|██████▊   | 702/1024 [32:14:48<15:03:21, 168.33s/it][AINFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▊   | 703/1024 [32:17:30<14:51:09, 166.57s/it][A
+                                                         [A{'loss': 0.0922, 'grad_norm': 0.005174044985324144, 'learning_rate': 1e-05, 'num_tokens': 621407854.0, 'completions/mean_length': 6016.0625, 'completions/min_length': 986.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5851.4921875, 'completions/min_terminated_length': 986.0, 'completions/max_terminated_length': 14395.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.25330984592437744, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017704609781503677, 'sampling/sampling_logp_difference/max': 10.249993324279785, 'sampling/importance_sampling_ratio/min': 3.535773794283159e-05, 'sampling/importance_sampling_ratio/mean': 0.9999493956565857, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7685846760869026, 'clip_ratio/low_mean': 2.6169475859205704e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3923624869203195e-06, 'clip_ratio/high_max': 1.3569449947681278e-05, 'clip_ratio/region_mean': 2.95618385734997e-05, 'epoch': 0.65}
+
+ 69%|██████▊   | 703/1024 [32:17:30<14:51:09, 166.57s/it][AINFO 12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▉   | 704/1024 [32:20:09<14:36:03, 164.26s/it][A
+                                                         [A{'loss': 0.1083, 'grad_norm': 0.0022989478893578053, 'learning_rate': 1e-05, 'num_tokens': 622246633.0, 'completions/mean_length': 6402.3984375, 'completions/min_length': 443.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6323.80322265625, 'completions/min_terminated_length': 443.0, 'completions/max_terminated_length': 15771.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32089439034461975, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01977568492293358, 'sampling/sampling_logp_difference/max': 14.645465850830078, 'sampling/importance_sampling_ratio/min': 4.360687739790592e-07, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8285454586148262, 'clip_ratio/low_mean': 3.712984198500635e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2330010551740997e-06, 'clip_ratio/high_max': 8.932004220696399e-06, 'clip_ratio/region_mean': 3.936284304018045e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 704/1024 [32:20:09<14:36:03, 164.26s/it][AINFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 705/1024 [32:22:47<14:23:54, 162.49s/it][A
+                                                         [A{'loss': 0.0586, 'grad_norm': 0.0039679198525846004, 'learning_rate': 1e-05, 'num_tokens': 623047420.0, 'completions/mean_length': 6085.7734375, 'completions/min_length': 559.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5838.6162109375, 'completions/min_terminated_length': 559.0, 'completions/max_terminated_length': 16192.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2477683573961258, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01756519451737404, 'sampling/sampling_logp_difference/max': 10.26925277709961, 'sampling/importance_sampling_ratio/min': 3.468328213784844e-05, 'sampling/importance_sampling_ratio/mean': 0.9999336004257202, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.788465715944767, 'clip_ratio/low_mean': 3.0628862987214234e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1193895943506504e-06, 'clip_ratio/high_max': 4.477558377402602e-06, 'clip_ratio/region_mean': 3.174825269525172e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 705/1024 [32:22:47<14:23:54, 162.49s/it][AINFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▉   | 706/1024 [32:25:13<13:54:52, 157.52s/it][A
+                                                         [A{'loss': 0.0116, 'grad_norm': 0.005460201762616634, 'learning_rate': 1e-05, 'num_tokens': 623879902.0, 'completions/mean_length': 6338.265625, 'completions/min_length': 757.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6259.16552734375, 'completions/min_terminated_length': 757.0, 'completions/max_terminated_length': 15766.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02087930217385292, 'sampling/sampling_logp_difference/max': 4.832030773162842, 'sampling/importance_sampling_ratio/min': 0.007970319129526615, 'sampling/importance_sampling_ratio/mean': 1.000030279159546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9556885957717896, 'clip_ratio/low_mean': 1.3522747963179427e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.254188070262899e-06, 'clip_ratio/high_max': 1.241185282196966e-05, 'clip_ratio/region_mean': 1.7776936260816e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 706/1024 [32:25:13<13:54:52, 157.52s/it][AINFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 21:52:02,228 - math_verify.grader - WARNING - Timeout during comparison
+
+ 69%|██████▉   | 707/1024 [32:28:06<14:16:36, 162.14s/it][A
+                                                         [A{'loss': 0.0485, 'grad_norm': 0.0026750562246888876, 'learning_rate': 1e-05, 'num_tokens': 624797851.0, 'completions/mean_length': 6995.6015625, 'completions/min_length': 1643.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6846.57958984375, 'completions/min_terminated_length': 1643.0, 'completions/max_terminated_length': 15728.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.14123955368995667, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.021924620494246483, 'sampling/sampling_logp_difference/max': 9.499988555908203, 'sampling/importance_sampling_ratio/min': 7.485268724849448e-05, 'sampling/importance_sampling_ratio/mean': 1.000006079673767, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0373736545443535, 'clip_ratio/low_mean': 3.0928131309337914e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.542219234186632e-07, 'clip_ratio/high_max': 3.416887693674653e-06, 'clip_ratio/region_mean': 3.178235323275658e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 707/1024 [32:28:06<14:16:36, 162.14s/it][AINFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▉   | 708/1024 [32:31:19<15:02:33, 171.37s/it][A
+                                                         [A{'loss': 0.0458, 'grad_norm': 0.0032085489947348833, 'learning_rate': 1e-05, 'num_tokens': 625759543.0, 'completions/mean_length': 7361.28125, 'completions/min_length': 832.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 6341.3212890625, 'completions/min_terminated_length': 832.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.27062684297561646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018720708787441254, 'sampling/sampling_logp_difference/max': 6.406182765960693, 'sampling/importance_sampling_ratio/min': 0.001651315949857235, 'sampling/importance_sampling_ratio/mean': 1.0000088214874268, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8065696209669113, 'clip_ratio/low_mean': 5.126845326230978e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.276005024119513e-06, 'clip_ratio/high_max': 2.134235910489224e-05, 'clip_ratio/region_mean': 5.754445828642929e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 708/1024 [32:31:19<15:02:33, 171.37s/it][AINFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▉   | 709/1024 [32:33:56<14:36:38, 166.98s/it][A
+                                                         [A{'loss': 0.0283, 'grad_norm': 0.0034961337223649025, 'learning_rate': 1e-05, 'num_tokens': 626602944.0, 'completions/mean_length': 6415.4453125, 'completions/min_length': 890.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6336.95263671875, 'completions/min_terminated_length': 890.0, 'completions/max_terminated_length': 15793.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2709311842918396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02052130736410618, 'sampling/sampling_logp_difference/max': 11.249995231628418, 'sampling/importance_sampling_ratio/min': 1.3007359484618064e-05, 'sampling/importance_sampling_ratio/mean': 0.9999567866325378, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9348134994506836, 'clip_ratio/low_mean': 3.820702841039747e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.349654664612899e-06, 'clip_ratio/high_max': 9.398618658451596e-06, 'clip_ratio/region_mean': 4.055668296132353e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 709/1024 [32:33:56<14:36:38, 166.98s/it][AINFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▉   | 710/1024 [32:37:01<15:02:02, 172.36s/it][A
+                                                         [A{'loss': 0.0559, 'grad_norm': 0.004416701849550009, 'learning_rate': 1e-05, 'num_tokens': 627629595.0, 'completions/mean_length': 7879.3359375, 'completions/min_length': 592.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7079.75244140625, 'completions/min_terminated_length': 592.0, 'completions/max_terminated_length': 15440.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.31930169463157654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018000833690166473, 'sampling/sampling_logp_difference/max': 8.248465538024902, 'sampling/importance_sampling_ratio/min': 0.0002616597630549222, 'sampling/importance_sampling_ratio/mean': 0.9999274015426636, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7092025354504585, 'clip_ratio/low_mean': 5.279648712530616e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.977412849413668e-06, 'clip_ratio/high_max': 3.190965139765467e-05, 'clip_ratio/region_mean': 6.077389980418957e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 710/1024 [32:37:01<15:02:02, 172.36s/it][AINFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▉   | 711/1024 [32:39:38<14:35:45, 167.88s/it][A
+                                                         [A{'loss': 0.0813, 'grad_norm': 0.0047090682201087475, 'learning_rate': 1e-05, 'num_tokens': 628409064.0, 'completions/mean_length': 5936.2890625, 'completions/min_length': 491.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5685.54443359375, 'completions/min_terminated_length': 491.0, 'completions/max_terminated_length': 14801.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01985335350036621, 'sampling/sampling_logp_difference/max': 7.311888217926025, 'sampling/importance_sampling_ratio/min': 0.0006675553740933537, 'sampling/importance_sampling_ratio/mean': 0.9999631643295288, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8411448448896408, 'clip_ratio/low_mean': 5.0108070581700304e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.319128604241996e-06, 'clip_ratio/high_max': 2.1276514416967984e-05, 'clip_ratio/region_mean': 5.54271991859423e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 711/1024 [32:39:38<14:35:45, 167.88s/it][AINFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache
diff --git a/grpo_dora_7b_20251202_013940/README.md b/grpo_dora_7b_20251202_013940/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2610c1d81840332bfdf7b5c84963fa9d35c5ec12
--- /dev/null
+++ b/grpo_dora_7b_20251202_013940/README.md
@@ -0,0 +1,68 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+library_name: transformers
+model_name: dapo_dora_7b_20251202_013940
+tags:
+- generated_from_trainer
+- grpo
+- trl
+licence: license
+---
+
+# Model Card for dapo_dora_7b_20251202_013940
+
+This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="outputs/dapo_dora_7b_20251202_013940", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/6dmxhs58) 
+
+
+This model was trained with GRPO (using the DAPO loss variant, `loss_type="dapo"`), a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
+### Framework versions
+
+- TRL: 0.25.0
+- Transformers: 4.57.1
+- Pytorch: 2.8.0
+- Datasets: 4.4.1
+- Tokenizers: 0.22.1
+
+## Citations
+
+Cite GRPO as:
+
+```bibtex
+@article{shao2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+
+```
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/grpo_dora_7b_20251202_013940/output.log b/grpo_dora_7b_20251202_013940/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..5ff56981d2e32cb709ef86cbe9e4c3203d93103e
--- /dev/null
+++ b/grpo_dora_7b_20251202_013940/output.log
@@ -0,0 +1,4721 @@
+W1202 01:40:02.367000 101639 torch/distributed/run.py:774] 
+W1202 01:40:02.367000 101639 torch/distributed/run.py:774] *****************************************
+W1202 01:40:02.367000 101639 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W1202 01:40:02.367000 101639 torch/distributed/run.py:774] *****************************************
+INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda.
+INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda.
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-02 01:40:30,218 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it
+[OpenTinker] 2025-12-02 01:40:30,218 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it
+[OpenTinker] 2025-12-02 01:40:30,218 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-12-02 01:40:30,221 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: setting up run i7o9cken
+wandb: setting up run jco9wivt
+wandb: setting up run 6dmxhs58
+wandb: setting up run 9u4d73kf
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-i7o9cken
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_dora_7b_20251202_013940
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/i7o9cken
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-jco9wivt
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_dora_7b_20251202_013940
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/jco9wivt
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 01:40:35,900 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 01:40:35,900 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-6dmxhs58
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_dora_7b_20251202_013940
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/6dmxhs58
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 01:40:36,151 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 01:40:36,151 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-9u4d73kf
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dapo_dora_7b_20251202_013940
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/9u4d73kf
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 01:40:36,297 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 01:40:36,297 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-12-02 01:40:36,446 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-12-02 01:40:36,447 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+[OpenTinker] 2025-12-02 01:40:37,277 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 01:40:37,397 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 01:40:37,473 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 01:40:37,656 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-12-02 01:40:40,347 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+[OpenTinker] 2025-12-02 01:40:40,404 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+[OpenTinker] 2025-12-02 01:40:40,483 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+[OpenTinker] 2025-12-02 01:40:40,573 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
+
+
+Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03,  3.44s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03,  3.43s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03,  3.44s/it][A[A[A
+Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03,  3.44s/it][A
+
+
+
+Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.71s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.71s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.71s/it][ALoading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.71s/it][A[A[ALoading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.82s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.82s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.82s/it]
+
+Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.82s/it]
+
+[OpenTinker] 2025-12-02 01:40:47,415 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 01:40:47,416 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 01:40:47,431 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 01:40:47,432 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 01:40:47,602 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 01:40:47,602 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 01:40:47,807 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 01:40:47,810 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-12-02 01:40:47,810 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-12-02 01:40:47,826 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 01:40:47,967 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 01:40:48,194 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-12-02 01:40:48,328 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpvdeqtfm6/test.c -o /tmp/tmpvdeqtfm6/test.o
+[OpenTinker] 2025-12-02 01:40:48,341 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpgrfhufhd/test.c -o /tmp/tmpgrfhufhd/test.o
+[OpenTinker] 2025-12-02 01:40:48,341 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp2ovj9q8y/test.c -o /tmp/tmp2ovj9q8y/test.o
+[OpenTinker] 2025-12-02 01:40:48,359 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpvdeqtfm6/test.o -laio -o /tmp/tmpvdeqtfm6/a.out
+[OpenTinker] 2025-12-02 01:40:48,376 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpgrfhufhd/test.o -laio -o /tmp/tmpgrfhufhd/a.out
+[OpenTinker] 2025-12-02 01:40:48,387 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp2ovj9q8y/test.o -laio -o /tmp/tmp2ovj9q8y/a.out
+[OpenTinker] 2025-12-02 01:40:48,548 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmplg6pb6s0/test.c -o /tmp/tmplg6pb6s0/test.o
+[OpenTinker] 2025-12-02 01:40:48,577 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmplg6pb6s0/test.o -laio -o /tmp/tmplg6pb6s0/a.out
+[OpenTinker] 2025-12-02 01:40:48,811 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp2sthw5q5/test.c -o /tmp/tmp2sthw5q5/test.o
+[OpenTinker] 2025-12-02 01:40:48,839 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp2sthw5q5/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp2sthw5q5/a.out
+[OpenTinker] 2025-12-02 01:40:48,933 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpm1pqqhde/test.c -o /tmp/tmpm1pqqhde/test.o
+[OpenTinker] 2025-12-02 01:40:48,947 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpei2uy0jc/test.c -o /tmp/tmpei2uy0jc/test.o
+[OpenTinker] 2025-12-02 01:40:48,961 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpm1pqqhde/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpm1pqqhde/a.out
+[OpenTinker] 2025-12-02 01:40:48,976 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpei2uy0jc/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpei2uy0jc/a.out
+[OpenTinker] 2025-12-02 01:40:49,057 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpx1alfpxm/test.c -o /tmp/tmpx1alfpxm/test.o
+[OpenTinker] 2025-12-02 01:40:49,081 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpx1alfpxm/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpx1alfpxm/a.out
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO cudaDriverVersion 12090
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0>
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Initialized NET plugin Socket
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO ncclCommInitRankConfig comm 0x1f8d2890 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x1a9a18bbe7484081 - Init START
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e401220 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x1a9a18bbe7484081 - Init START
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO ncclCommInitRankConfig comm 0x1e31d2c0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x1a9a18bbe7484081 - Init START
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1c5c60 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x1a9a18bbe7484081 - Init START
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Bootstrap timings total 0.001227 (create 0.000018, send 0.000094, recv 0.000685, ring 0.000150, delay 0.000000)
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Bootstrap timings total 0.005961 (create 0.000022, send 0.000079, recv 0.004801, ring 0.000737, delay 0.000001)
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Bootstrap timings total 0.000679 (create 0.000020, send 0.000094, recv 0.000180, ring 0.000067, delay 0.000000)
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Bootstrap timings total 0.003019 (create 0.000020, send 0.000097, recv 0.000073, ring 0.000096, delay 0.000000)
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO comm 0x1f1c5c60 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO comm 0x1e401220 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO comm 0x1f8d2890 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO comm 0x1e31d2c0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:101885:102984 [2] NCCL INFO [Proxy Service] Device 2 CPU core 151
+lshn-qs-g2ri-2:101885:102985 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 69
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-g2ri-2:101883:102987 [0] NCCL INFO [Proxy Service] Device 0 CPU core 166
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:101886:102986 [3] NCCL INFO [Proxy Service] Device 3 CPU core 87
+lshn-qs-g2ri-2:101886:102988 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 90
+lshn-qs-g2ri-2:101883:102989 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 169
+lshn-qs-g2ri-2:101884:102990 [1] NCCL INFO [Proxy Service] Device 1 CPU core 147
+lshn-qs-g2ri-2:101884:102991 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 170
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1c5c60 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x1a9a18bbe7484081 - Init COMPLETE
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.90 (kernels 0.16, alloc 0.57, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.08, rest 0.04)
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO ncclCommInitRankConfig comm 0x1e31d2c0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x1a9a18bbe7484081 - Init COMPLETE
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.89 (kernels 0.16, alloc 0.57, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.08, rest 0.04)
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO ncclCommInitRankConfig comm 0x1f8d2890 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x1a9a18bbe7484081 - Init COMPLETE
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.92 (kernels 0.16, alloc 0.59, bootstrap 0.01, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.08, rest 0.04)
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e401220 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x1a9a18bbe7484081 - Init COMPLETE
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.90 (kernels 0.16, alloc 0.57, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.09, rest 0.03)
+[OpenTinker] 2025-12-02 01:40:52,806 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 01:40:52,809 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 01:40:52,812 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-12-02 01:40:52,827 - root - INFO - Training model with GRPO
+INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'}
+INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'}
+INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'}
+INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'}
+INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896
+INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896
+INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896
+INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896
+INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null}
+INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null}
+INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null}
+INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null}
+[rank2]:[W1202 01:41:14.565456558 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+[rank1]:[W1202 01:41:14.662468277 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+[rank0]:[W1202 01:41:14.662551621 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+[rank3]:[W1202 01:41:14.682452457 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO ncclCommSplit comm 0x20f332d0 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 1 color 2003953581 key 0- Init START
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO ncclCommSplit comm 0x20250b20 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 1 color 2003953581 key 2- Init START
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO ncclCommSplit comm 0x1f3cce90 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 1 color 2003953581 key 1- Init START
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO ncclCommSplit comm 0x1f52d880 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 1 color 2003953581 key 3- Init START
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO comm 0x20250b20 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO comm 0x1f3cce90 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO comm 0x20f332d0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO comm 0x1f52d880 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103132 [1] NCCL INFO [Proxy Service] Device 1 CPU core 54
+lshn-qs-g2ri-2:101884:103133 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 65
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-g2ri-2:101883:103134 [0] NCCL INFO [Proxy Service] Device 0 CPU core 181
+lshn-qs-g2ri-2:101883:103135 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 182
+lshn-qs-g2ri-2:101885:103136 [2] NCCL INFO [Proxy Service] Device 2 CPU core 183
+lshn-qs-g2ri-2:101885:103137 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 90
+lshn-qs-g2ri-2:101886:103138 [3] NCCL INFO [Proxy Service] Device 3 CPU core 91
+lshn-qs-g2ri-2:101886:103139 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 188
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO ncclCommSplit comm 0x20250b20 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 1 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO ncclCommSplit comm 0x20f332d0 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 1 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO ncclCommSplit comm 0x1f3cce90 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 1 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO ncclCommSplit comm 0x1f52d880 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 1 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.20 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.13)
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.11 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.03)
+lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.11 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.03)
+lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.01)
+[Gloo] Rank 1 is connected to 3[Gloo] Rank 0 peer ranks. Expected number of connected peer ranks is :  is connected to 33
+ peer ranks. Expected number of connected peer ranks is : [Gloo] Rank [Gloo] Rank 23
+3 is connected to  is connected to 33 peer ranks.  peer ranks. Expected number of connected peer ranks is : Expected number of connected peer ranks is : 33
+
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO ncclCommSplit comm 0x21044ab0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 2 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO comm 0x21044ab0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101883:103162 [0] NCCL INFO [Proxy Service] Device 0 CPU core 51
+lshn-qs-g2ri-2:101883:103163 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 73
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO ncclCommSplit comm 0x21044ab0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 2 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO ncclCommSplit comm 0x1f4e1200 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 4 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO comm 0x1f4e1200 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101884:103177 [1] NCCL INFO [Proxy Service] Device 1 CPU core 170
+lshn-qs-g2ri-2:101884:103178 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 177
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO ncclCommSplit comm 0x1f4e1200 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 4 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO ncclCommSplit comm 0x203652a0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 6 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO comm 0x203652a0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101885:103192 [2] NCCL INFO [Proxy Service] Device 2 CPU core 176
+lshn-qs-g2ri-2:101885:103193 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 190
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO ncclCommSplit comm 0x203652a0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 6 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO ncclCommSplit comm 0x1f641820 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 8 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO comm 0x1f641820 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101886:103209 [3] NCCL INFO [Proxy Service] Device 3 CPU core 62
+lshn-qs-g2ri-2:101886:103210 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 65
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO ncclCommSplit comm 0x1f641820 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 8 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO ncclCommSplit comm 0x227a2cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 9 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO comm 0x227a2cc0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101883:103218 [0] NCCL INFO [Proxy Service] Device 0 CPU core 64
+lshn-qs-g2ri-2:101883:103219 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 58
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO ncclCommSplit comm 0x227a2cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 9 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO ncclCommSplit comm 0x20c3aa70 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 11 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO comm 0x20c3aa70 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101884:103233 [1] NCCL INFO [Proxy Service] Device 1 CPU core 183
+lshn-qs-g2ri-2:101884:103234 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 165
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO ncclCommSplit comm 0x20c3aa70 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 11 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO ncclCommSplit comm 0x21ac25f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 13 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO comm 0x21ac25f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101885:103248 [2] NCCL INFO [Proxy Service] Device 2 CPU core 78
+lshn-qs-g2ri-2:101885:103249 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 170
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO ncclCommSplit comm 0x21ac25f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 13 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO ncclCommSplit comm 0x20d8aea0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 15 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO comm 0x20d8aea0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101886:103265 [3] NCCL INFO [Proxy Service] Device 3 CPU core 168
+lshn-qs-g2ri-2:101886:103266 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 66
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO ncclCommSplit comm 0x20d8aea0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 15 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO ncclCommSplit comm 0x228aa8d0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 16 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO comm 0x228aa8d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101883:103274 [0] NCCL INFO [Proxy Service] Device 0 CPU core 54
+lshn-qs-g2ri-2:101883:103275 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 67
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO ncclCommSplit comm 0x228aa8d0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 16 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO ncclCommSplit comm 0x20d42680 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 18 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO comm 0x20d42680 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101884:103289 [1] NCCL INFO [Proxy Service] Device 1 CPU core 79
+lshn-qs-g2ri-2:101884:103290 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 169
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO ncclCommSplit comm 0x20d42680 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 18 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO ncclCommSplit comm 0x21bca200 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 20 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO comm 0x21bca200 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101885:103304 [2] NCCL INFO [Proxy Service] Device 2 CPU core 177
+lshn-qs-g2ri-2:101885:103305 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 188
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO ncclCommSplit comm 0x21bca200 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 20 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.08, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO ncclCommSplit comm 0x20e92ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 22 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO comm 0x20e92ab0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101886:103321 [3] NCCL INFO [Proxy Service] Device 3 CPU core 58
+lshn-qs-g2ri-2:101886:103322 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 180
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO ncclCommSplit comm 0x20e92ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 22 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO ncclCommSplit comm 0x229b24e0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 23 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO comm 0x229b24e0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101883:103330 [0] NCCL INFO [Proxy Service] Device 0 CPU core 183
+lshn-qs-g2ri-2:101883:103331 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 165
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO ncclCommSplit comm 0x229b24e0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 23 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO ncclCommSplit comm 0x20e4a290 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 25 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO comm 0x20e4a290 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101884:103345 [1] NCCL INFO [Proxy Service] Device 1 CPU core 57
+lshn-qs-g2ri-2:101884:103346 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 60
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO ncclCommSplit comm 0x20e4a290 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 25 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO ncclCommSplit comm 0x21cd1e10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 27 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO comm 0x21cd1e10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101885:103360 [2] NCCL INFO [Proxy Service] Device 2 CPU core 86
+lshn-qs-g2ri-2:101885:103361 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 190
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO ncclCommSplit comm 0x21cd1e10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 27 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO ncclCommSplit comm 0x20f9a6c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 29 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO comm 0x20f9a6c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101886:103377 [3] NCCL INFO [Proxy Service] Device 3 CPU core 50
+lshn-qs-g2ri-2:101886:103378 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 54
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO ncclCommSplit comm 0x20f9a6c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 29 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO ncclCommSplit comm 0x22aba0f0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 30 color 59908776 key 0- Init START
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO comm 0x22aba0f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101883:103386 [0] NCCL INFO [Proxy Service] Device 0 CPU core 67
+lshn-qs-g2ri-2:101883:103387 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 177
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO ncclCommSplit comm 0x22aba0f0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 30 color 59908776 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO ncclCommSplit comm 0x20f51ea0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 32 color 440515407 key 0- Init START
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO comm 0x20f51ea0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101884:103401 [1] NCCL INFO [Proxy Service] Device 1 CPU core 181
+lshn-qs-g2ri-2:101884:103402 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 84
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO ncclCommSplit comm 0x20f51ea0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 32 color 440515407 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.18 (kernels 0.00, alloc 0.04, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.12, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO ncclCommSplit comm 0x21dd9a20 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 34 color 1227022723 key 0- Init START
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO comm 0x21dd9a20 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101885:103416 [2] NCCL INFO [Proxy Service] Device 2 CPU core 172
+lshn-qs-g2ri-2:101885:103417 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 72
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO ncclCommSplit comm 0x21dd9a20 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 34 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Using network Socket
+INFO 12-02 01:41:15 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 01:41:15 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 01:41:15 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO ncclCommSplit comm 0x210a22d0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 36 color 1301067556 key 0- Init START
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO comm 0x210a22d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-g2ri-2:101886:103428 [3] NCCL INFO [Proxy Service] Device 3 CPU core 158
+lshn-qs-g2ri-2:101886:103429 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 68
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO ncclCommSplit comm 0x210a22d0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 36 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+INFO 12-02 01:41:15 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B...
+INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B...
+INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B...
+INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B...
+INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors']
+
+Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
+[A
+Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:05<00:05,  5.12s/it]
+[A
+Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:10<00:00,  5.29s/it]
+[ALoading safetensors checkpoint shards: 100% Completed | 2/2 [00:10<00:00,  5.26s/it]
+
+INFO 12-02 01:41:29 [default_loader.py:268] Loading weights took 9.15 seconds
+INFO 12-02 01:41:29 [gpu_model_runner.py:2392] Model loading took 14.2717 GiB and 12.992463 seconds
+INFO 12-02 01:41:30 [default_loader.py:268] Loading weights took 10.34 seconds
+INFO 12-02 01:41:30 [default_loader.py:268] Loading weights took 12.06 seconds
+INFO 12-02 01:41:30 [default_loader.py:268] Loading weights took 11.19 seconds
+INFO 12-02 01:41:30 [gpu_model_runner.py:2392] Model loading took 14.2717 GiB and 13.646185 seconds
+INFO 12-02 01:41:30 [gpu_model_runner.py:2392] Model loading took 14.2717 GiB and 13.648457 seconds
+INFO 12-02 01:41:30 [gpu_model_runner.py:2392] Model loading took 14.2717 GiB and 13.603632 seconds
+INFO 12-02 01:41:36 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/ce3af371c3/rank_1_0/backbone for vLLM's torch.compile
+INFO 12-02 01:41:36 [backends.py:550] Dynamo bytecode transform time: 5.89 s
+INFO 12-02 01:41:36 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/ce3af371c3/rank_2_0/backbone for vLLM's torch.compile
+INFO 12-02 01:41:36 [backends.py:550] Dynamo bytecode transform time: 5.71 s
+INFO 12-02 01:41:36 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/ce3af371c3/rank_0_0/backbone for vLLM's torch.compile
+INFO 12-02 01:41:36 [backends.py:550] Dynamo bytecode transform time: 5.67 s
+INFO 12-02 01:41:36 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/ce3af371c3/rank_3_0/backbone for vLLM's torch.compile
+INFO 12-02 01:41:36 [backends.py:550] Dynamo bytecode transform time: 5.73 s
+INFO 12-02 01:41:49 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 13.359 s
+INFO 12-02 01:41:50 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 13.460 s
+INFO 12-02 01:41:50 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 14.003 s
+INFO 12-02 01:41:50 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 14.216 s
+INFO 12-02 01:41:52 [monitor.py:34] torch.compile takes 5.89 s in total
+INFO 12-02 01:41:53 [monitor.py:34] torch.compile takes 5.67 s in total
+INFO 12-02 01:41:53 [gpu_worker.py:298] Available KV cache memory: 38.86 GiB
+INFO 12-02 01:41:53 [gpu_worker.py:298] Available KV cache memory: 38.86 GiB
+INFO 12-02 01:41:54 [monitor.py:34] torch.compile takes 5.73 s in total
+INFO 12-02 01:41:54 [monitor.py:34] torch.compile takes 5.71 s in total
+INFO 12-02 01:41:54 [gpu_worker.py:298] Available KV cache memory: 38.86 GiB
+INFO 12-02 01:41:54 [gpu_worker.py:298] Available KV cache memory: 38.86 GiB
+INFO 12-02 01:41:55 [kv_cache_utils.py:864] GPU KV cache size: 727,664 tokens
+INFO 12-02 01:41:55 [kv_cache_utils.py:864] GPU KV cache size: 727,664 tokens
+INFO 12-02 01:41:55 [kv_cache_utils.py:864] GPU KV cache size: 727,664 tokens
+INFO 12-02 01:41:55 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 43.07x
+INFO 12-02 01:41:55 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 43.07x
+INFO 12-02 01:41:55 [kv_cache_utils.py:864] GPU KV cache size: 727,664 tokens
+INFO 12-02 01:41:55 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 43.07x
+INFO 12-02 01:41:55 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 43.07x
+
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/7 [00:00<?, ?it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  43%|████▎     | 3/7 [00:00<00:00, 29.43it/s][ACapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 7/7 [00:00<00:00, 34.83it/s]
+INFO 12-02 01:41:55 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.13 GiB
+INFO 12-02 01:41:55 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 14.27 GiB for weight, 0.63 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.13 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=41429755289` to fit into requested memory, or `--kv-cache-memory=130717273088` to fully utilize gpu memory. Current kv cache memory in use is 41727550873 bytes.
+INFO 12-02 01:41:55 [core.py:218] init engine (profile, create kv cache, warmup model) took 25.34 seconds
+INFO 12-02 01:41:55 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.13 GiB
+INFO 12-02 01:41:55 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 14.27 GiB for weight, 0.63 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.13 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=41429755289` to fit into requested memory, or `--kv-cache-memory=130717273088` to fully utilize gpu memory. Current kv cache memory in use is 41727550873 bytes.
+INFO 12-02 01:41:55 [core.py:218] init engine (profile, create kv cache, warmup model) took 25.30 seconds
+INFO 12-02 01:41:56 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.13 GiB
+INFO 12-02 01:41:56 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 14.27 GiB for weight, 0.63 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.13 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=41429755289` to fit into requested memory, or `--kv-cache-memory=130717273088` to fully utilize gpu memory. Current kv cache memory in use is 41727550873 bytes.
+INFO 12-02 01:41:56 [core.py:218] init engine (profile, create kv cache, warmup model) took 25.52 seconds
+INFO 12-02 01:41:56 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.13 GiB
+INFO 12-02 01:41:56 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 14.27 GiB for weight, 0.63 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.13 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=41429755289` to fit into requested memory, or `--kv-cache-memory=130717273088` to fully utilize gpu memory. Current kv cache memory in use is 41727550873 bytes.
+INFO 12-02 01:41:56 [core.py:218] init engine (profile, create kv cache, warmup model) took 26.40 seconds
+INFO 12-02 01:41:56 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 01:41:56 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 00/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 16/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM
+INFO 12-02 01:41:56 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 01:41:56 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 07/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 13/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM
+INFO 12-02 01:41:57 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 01:41:57 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 00/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM
+INFO 12-02 01:41:57 [llm.py:295] Supported_tasks: ('generate',)
+INFO 12-02 01:41:57 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 04/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 19/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 23/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+[OpenTinker] 2025-12-02 01:42:00,702 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value.
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Using network Socket
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO ncclCommSplit comm 0x1c1970c0 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 37 color 2003953581 key 2- Init START
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO ncclCommSplit comm 0x55f32550 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 37 color 2003953581 key 0- Init START
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO ncclCommSplit comm 0x1b29dc00 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 37 color 2003953581 key 1- Init START
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO ncclCommSplit comm 0x1b438a80 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 37 color 2003953581 key 3- Init START
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO comm 0x1b29dc00 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO comm 0x55f32550 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO comm 0x1b438a80 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO comm 0x1c1970c0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-g2ri-2:101884:103571 [1] NCCL INFO [Proxy Service] Device 1 CPU core 68
+lshn-qs-g2ri-2:101884:103572 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 166
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-g2ri-2:101883:103573 [0] NCCL INFO [Proxy Service] Device 0 CPU core 146
+lshn-qs-g2ri-2:101883:103574 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 149
+lshn-qs-g2ri-2:101885:103575 [2] NCCL INFO [Proxy Service] Device 2 CPU core 168
+lshn-qs-g2ri-2:101885:103576 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 169
+lshn-qs-g2ri-2:101886:103577 [3] NCCL INFO [Proxy Service] Device 3 CPU core 77
+lshn-qs-g2ri-2:101886:103578 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 78
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO ncclCommSplit comm 0x1c1970c0 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 37 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO ncclCommSplit comm 0x55f32550 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 37 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO ncclCommSplit comm 0x1b29dc00 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 37 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO ncclCommSplit comm 0x1b438a80 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 37 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.44 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.37)
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.02)
+lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.33 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.26)
+lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.02)
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 00/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 00/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 04/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 07/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 13/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 16/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 19/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 23/0 : 1[5] -> 2[6] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+INFO 12-02 01:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:42:04 [block_pool.py:292] Successfully reset prefix cache
+wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
+
+  0%|          | 0/1024 [00:00<?, ?it/s][AINFO 12-02 01:42:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:42:10 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 01:42:16 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 01:42:16 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 12-02 01:42:20 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 1/1024 [04:45<81:00:16, 285.06s/it][A
+                                                    [A{'loss': 0.0764, 'grad_norm': 0.0027154693379998207, 'learning_rate': 1e-05, 'num_tokens': 372903.0, 'completions/mean_length': 5701.859375, 'completions/min_length': 630.0, 'completions/max_length': 16233.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5701.859375, 'completions/min_terminated_length': 630.0, 'completions/max_terminated_length': 16233.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.4581822156906128, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.01358163170516491, 'sampling/sampling_logp_difference/max': 1.290907621383667, 'sampling/importance_sampling_ratio/min': 0.2750210464000702, 'sampling/importance_sampling_ratio/mean': 1.0000617504119873, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.35103847086429596, 'clip_ratio/low_mean': 0.0, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 0.0, 'epoch': 0.0}
+
+  0%|          | 1/1024 [04:45<81:00:16, 285.06s/it][AINFO 12-02 01:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:46:50 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 2/1024 [09:56<85:17:17, 300.43s/it][A
+                                                    [A{'loss': -0.0397, 'grad_norm': 0.0009062179597094655, 'learning_rate': 1e-05, 'num_tokens': 801945.0, 'completions/mean_length': 6527.65625, 'completions/min_length': 393.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5692.373046875, 'completions/min_terminated_length': 393.0, 'completions/max_terminated_length': 15817.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.19044628739356995, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020587297156453133, 'sampling/sampling_logp_difference/max': 3.399496078491211, 'sampling/importance_sampling_ratio/min': 0.03339008986949921, 'sampling/importance_sampling_ratio/mean': 0.9999315142631531, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5323723964393139, 'clip_ratio/low_mean': 0.00011685759454849176, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.702819594764151e-05, 'clip_ratio/high_max': 0.00014811278379056603, 'clip_ratio/region_mean': 0.00015388579049613327, 'epoch': 0.0}
+
+  0%|          | 2/1024 [09:56<85:17:17, 300.43s/it][AINFO 12-02 01:52:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:52:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:52:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:52:01 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 3/1024 [14:11<79:21:53, 279.84s/it][A
+                                                    [A{'loss': 0.0258, 'grad_norm': 0.0021089769434183836, 'learning_rate': 1e-05, 'num_tokens': 1144006.0, 'completions/mean_length': 5189.328125, 'completions/min_length': 861.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5011.63525390625, 'completions/min_terminated_length': 861.0, 'completions/max_terminated_length': 15412.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.2540663480758667, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.016339149326086044, 'sampling/sampling_logp_difference/max': 1.677985668182373, 'sampling/importance_sampling_ratio/min': 0.18674977123737335, 'sampling/importance_sampling_ratio/mean': 0.9998490810394287, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.430219117552042, 'clip_ratio/low_mean': 9.659631132308277e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1630116659944179e-05, 'clip_ratio/high_max': 4.6520466639776714e-05, 'clip_ratio/region_mean': 0.00010822642798302695, 'epoch': 0.0}
+
+  0%|          | 3/1024 [14:11<79:21:53, 279.84s/it][AINFO 12-02 01:56:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:56:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:56:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:56:17 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 4/1024 [16:56<66:28:21, 234.61s/it][A
+                                                    [A{'loss': 0.0921, 'grad_norm': 0.002283274196088314, 'learning_rate': 1e-05, 'num_tokens': 1375319.0, 'completions/mean_length': 3405.140625, 'completions/min_length': 823.0, 'completions/max_length': 10854.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3405.140625, 'completions/min_terminated_length': 823.0, 'completions/max_terminated_length': 10854.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5039526224136353, 'reward': 0.5, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016809869557619095, 'sampling/sampling_logp_difference/max': 0.8619894981384277, 'sampling/importance_sampling_ratio/min': 0.4634940028190613, 'sampling/importance_sampling_ratio/mean': 0.9998574256896973, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5608247071504593, 'clip_ratio/low_mean': 0.00024632705572003033, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4321808723470895e-05, 'clip_ratio/high_max': 6.535875400004443e-05, 'clip_ratio/region_mean': 0.00027064886216976447, 'epoch': 0.0}
+
+  0%|          | 4/1024 [16:56<66:28:21, 234.61s/it][AINFO 12-02 01:59:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:59:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:59:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:59:02 [block_pool.py:292] Successfully reset prefix cache
+
+  0%|          | 5/1024 [22:07<74:07:47, 261.89s/it][A
+                                                    [A{'loss': 0.1194, 'grad_norm': 0.0019415807910263538, 'learning_rate': 1e-05, 'num_tokens': 1869429.0, 'completions/mean_length': 7561.59375, 'completions/min_length': 813.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 6478.140625, 'completions/min_terminated_length': 813.0, 'completions/max_terminated_length': 16113.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.5060110092163086, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.018491342663764954, 'sampling/sampling_logp_difference/max': 1.704564094543457, 'sampling/importance_sampling_ratio/min': 0.18185164034366608, 'sampling/importance_sampling_ratio/mean': 1.0000150203704834, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4615430533885956, 'clip_ratio/low_mean': 0.00038889441020728555, 'clip_ratio/low_min': 4.806927427125629e-05, 'clip_ratio/high_mean': 8.506695462529024e-05, 'clip_ratio/high_max': 0.00023204535409604432, 'clip_ratio/region_mean': 0.00047396136142197065, 'epoch': 0.0}
+
+  0%|          | 5/1024 [22:07<74:07:47, 261.89s/it][AINFO 12-02 02:04:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:04:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:04:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:04:12 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 6/1024 [26:36<74:45:29, 264.37s/it][A
+                                                    [A{'loss': 0.1078, 'grad_norm': 0.004854729399085045, 'learning_rate': 1e-05, 'num_tokens': 2173935.0, 'completions/mean_length': 4626.65625, 'completions/min_length': 489.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4440.0322265625, 'completions/min_terminated_length': 489.0, 'completions/max_terminated_length': 15357.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.328794926404953, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017279166728258133, 'sampling/sampling_logp_difference/max': 2.3973019123077393, 'sampling/importance_sampling_ratio/min': 0.09096305072307587, 'sampling/importance_sampling_ratio/mean': 0.9999671578407288, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4968189671635628, 'clip_ratio/low_mean': 0.0001498379247095727, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.987271444813814e-05, 'clip_ratio/high_max': 0.0001864949517766945, 'clip_ratio/region_mean': 0.00021971064234094229, 'epoch': 0.0}
+
+  1%|          | 6/1024 [26:36<74:45:29, 264.37s/it][AINFO 12-02 02:08:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:08:42 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 7/1024 [31:25<76:59:03, 272.51s/it][A
+                                                    [A{'loss': 0.1273, 'grad_norm': 0.0012780033284798265, 'learning_rate': 1e-05, 'num_tokens': 2636475.0, 'completions/mean_length': 7064.1875, 'completions/min_length': 1017.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6442.8671875, 'completions/min_terminated_length': 1017.0, 'completions/max_terminated_length': 14628.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.29826053977012634, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018940825015306473, 'sampling/sampling_logp_difference/max': 1.7880760431289673, 'sampling/importance_sampling_ratio/min': 0.16728170216083527, 'sampling/importance_sampling_ratio/mean': 1.000023603439331, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48472268879413605, 'clip_ratio/low_mean': 0.00022404171659218264, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3911318041646155e-05, 'clip_ratio/high_max': 0.00013564527216658462, 'clip_ratio/region_mean': 0.0002579530337243341, 'epoch': 0.0}
+
+  1%|          | 7/1024 [31:25<76:59:03, 272.51s/it][AINFO 12-02 02:13:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:13:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:13:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:13:31 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 8/1024 [35:28<74:13:43, 263.02s/it][A
+                                                    [A{'loss': 0.0309, 'grad_norm': 0.0037453947588801384, 'learning_rate': 1e-05, 'num_tokens': 2919219.0, 'completions/mean_length': 4199.625, 'completions/min_length': 625.0, 'completions/max_length': 15827.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4199.625, 'completions/min_terminated_length': 625.0, 'completions/max_terminated_length': 15827.0, 'rewards/accuracy_reward/mean': 0.8125, 'rewards/accuracy_reward/std': 0.39339789748191833, 'reward': 0.8125, 'reward_std': 0.364027738571167, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015800442546606064, 'sampling/sampling_logp_difference/max': 1.1533584594726562, 'sampling/importance_sampling_ratio/min': 0.31557512283325195, 'sampling/importance_sampling_ratio/mean': 1.000178575515747, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.39248377084732056, 'clip_ratio/low_mean': 0.00015342943243012996, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.198954765248345e-05, 'clip_ratio/high_max': 0.00024346400823560543, 'clip_ratio/region_mean': 0.00023541897962786607, 'epoch': 0.0}
+
+  1%|          | 8/1024 [35:28<74:13:43, 263.02s/it][AINFO 12-02 02:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:33 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 9/1024 [40:06<75:30:56, 267.84s/it][A
+                                                    [A{'loss': 0.0068, 'grad_norm': 0.005575446877628565, 'learning_rate': 1e-05, 'num_tokens': 3293247.0, 'completions/mean_length': 5694.8125, 'completions/min_length': 704.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5525.14306640625, 'completions/min_terminated_length': 704.0, 'completions/max_terminated_length': 16360.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.398196816444397, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017013823613524437, 'sampling/sampling_logp_difference/max': 1.8705517053604126, 'sampling/importance_sampling_ratio/min': 0.15403865277767181, 'sampling/importance_sampling_ratio/mean': 0.9999315142631531, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.49477944523096085, 'clip_ratio/low_mean': 0.00035010947613045573, 'clip_ratio/low_min': 7.05023376212921e-05, 'clip_ratio/high_mean': 5.773717441570625e-05, 'clip_ratio/high_max': 0.0001517664118182438, 'clip_ratio/region_mean': 0.0004078466508872225, 'epoch': 0.0}
+
+  1%|          | 9/1024 [40:06<75:30:56, 267.84s/it][AINFO 12-02 02:22:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:22:12 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 10/1024 [44:54<77:10:16, 273.98s/it][A
+                                                     [A{'loss': -0.0655, 'grad_norm': 0.002770514925941825, 'learning_rate': 1e-05, 'num_tokens': 3762697.0, 'completions/mean_length': 7163.15625, 'completions/min_length': 908.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6548.43359375, 'completions/min_terminated_length': 908.0, 'completions/max_terminated_length': 15183.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.390625, 'reward_std': 0.41187620162963867, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020484812557697296, 'sampling/sampling_logp_difference/max': 2.3162002563476562, 'sampling/importance_sampling_ratio/min': 0.17125391960144043, 'sampling/importance_sampling_ratio/mean': 0.9999579191207886, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5555425211787224, 'clip_ratio/low_mean': 0.00025541063223499805, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.08127639527811e-05, 'clip_ratio/high_max': 8.86804000401753e-05, 'clip_ratio/region_mean': 0.00028622339414141607, 'epoch': 0.0}
+
+  1%|          | 10/1024 [44:54<77:10:16, 273.98s/it][AINFO 12-02 02:27:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:27:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:27:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:27:00 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 11/1024 [49:38<77:57:04, 277.02s/it][A
+                                                     [A{'loss': 0.0174, 'grad_norm': 0.0028343338053673506, 'learning_rate': 1e-05, 'num_tokens': 4084789.0, 'completions/mean_length': 4877.8125, 'completions/min_length': 539.0, 'completions/max_length': 16287.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4877.8125, 'completions/min_terminated_length': 539.0, 'completions/max_terminated_length': 16287.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.3934885859489441, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014944921247661114, 'sampling/sampling_logp_difference/max': 2.2180209159851074, 'sampling/importance_sampling_ratio/min': 0.10882426798343658, 'sampling/importance_sampling_ratio/mean': 1.0000348091125488, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40718213841319084, 'clip_ratio/low_mean': 0.00017655043006925553, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.636964644712862e-05, 'clip_ratio/high_max': 0.000195881901163375, 'clip_ratio/region_mean': 0.00024292007765325252, 'epoch': 0.01}
+
+  1%|          | 11/1024 [49:38<77:57:04, 277.02s/it][AINFO 12-02 02:31:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:31:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:31:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:31:44 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|          | 12/1024 [52:24<68:22:22, 243.22s/it][A
+                                                     [A{'loss': -0.1105, 'grad_norm': 0.0031670823227614164, 'learning_rate': 1e-05, 'num_tokens': 4322278.0, 'completions/mean_length': 3574.890625, 'completions/min_length': 551.0, 'completions/max_length': 11164.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3574.890625, 'completions/min_terminated_length': 551.0, 'completions/max_terminated_length': 11164.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.3934885859489441, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.013903278857469559, 'sampling/sampling_logp_difference/max': 1.140014886856079, 'sampling/importance_sampling_ratio/min': 0.3198142647743225, 'sampling/importance_sampling_ratio/mean': 0.9999250173568726, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.39788997918367386, 'clip_ratio/low_mean': 0.00018477893627277808, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.00010585433346932405, 'clip_ratio/high_max': 0.00026642361444828566, 'clip_ratio/region_mean': 0.0002906332756538177, 'epoch': 0.01}
+
+  1%|          | 12/1024 [52:24<68:22:22, 243.22s/it][AINFO 12-02 02:34:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:30 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 13/1024 [57:34<74:01:54, 263.61s/it][A
+                                                     [A{'loss': -0.0266, 'grad_norm': 0.0022498266771435738, 'learning_rate': 1e-05, 'num_tokens': 4901196.0, 'completions/mean_length': 8895.21875, 'completions/min_length': 1190.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 7975.5439453125, 'completions/min_terminated_length': 1190.0, 'completions/max_terminated_length': 15062.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.375, 'reward_std': 0.41186636686325073, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.02287137508392334, 'sampling/sampling_logp_difference/max': 3.691046714782715, 'sampling/importance_sampling_ratio/min': 0.024945877492427826, 'sampling/importance_sampling_ratio/mean': 0.9999292492866516, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5475166365504265, 'clip_ratio/low_mean': 0.0005431086046883138, 'clip_ratio/low_min': 0.00016618422341707628, 'clip_ratio/high_mean': 5.753459049628873e-05, 'clip_ratio/high_max': 0.00013996189318277175, 'clip_ratio/region_mean': 0.000600643197685713, 'epoch': 0.01}
+
+  1%|▏         | 13/1024 [57:34<74:01:54, 263.61s/it][AINFO 12-02 02:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:39:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:39:40 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 14/1024 [1:01:15<70:17:32, 250.55s/it][A
+                                                       [A{'loss': 0.0962, 'grad_norm': 0.0026783738285303116, 'learning_rate': 1e-05, 'num_tokens': 5246384.0, 'completions/mean_length': 5244.4375, 'completions/min_length': 918.0, 'completions/max_length': 14752.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5244.4375, 'completions/min_terminated_length': 918.0, 'completions/max_terminated_length': 14752.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.4050266742706299, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016051635146141052, 'sampling/sampling_logp_difference/max': 5.199403762817383, 'sampling/importance_sampling_ratio/min': 0.005519854370504618, 'sampling/importance_sampling_ratio/mean': 1.0000123977661133, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.47294290736317635, 'clip_ratio/low_mean': 0.0002754747401922941, 'clip_ratio/low_min': 2.1373001800384372e-05, 'clip_ratio/high_mean': 6.068926120406104e-05, 'clip_ratio/high_max': 0.0001712799453343905, 'clip_ratio/region_mean': 0.00033616399559832644, 'epoch': 0.01}
+
+  1%|▏         | 14/1024 [1:01:15<70:17:32, 250.55s/it][AINFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache
+
+  1%|▏         | 15/1024 [1:05:21<69:52:25, 249.30s/it][A
+                                                       [A{'loss': 0.0898, 'grad_norm': 0.004250024911016226, 'learning_rate': 1e-05, 'num_tokens': 5547083.0, 'completions/mean_length': 4557.296875, 'completions/min_length': 766.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4369.57177734375, 'completions/min_terminated_length': 766.0, 'completions/max_terminated_length': 16097.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.3745020925998688, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014819911681115627, 'sampling/sampling_logp_difference/max': 2.1122794151306152, 'sampling/importance_sampling_ratio/min': 0.12096192687749863, 'sampling/importance_sampling_ratio/mean': 1.0000675916671753, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.37265290692448616, 'clip_ratio/low_mean': 0.0002363620080814144, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.377000345812121e-05, 'clip_ratio/high_max': 0.00015819834061403526, 'clip_ratio/region_mean': 0.0002801320106300409, 'epoch': 0.01}
+
+  1%|▏         | 15/1024 [1:05:21<69:52:25, 249.30s/it][AINFO 12-02 02:47:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:47:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:47:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:47:27 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 16/1024 [1:08:38<65:22:57, 233.51s/it][A
+                                                       [A{'loss': -0.0558, 'grad_norm': 0.0019481063354760408, 'learning_rate': 1e-05, 'num_tokens': 5805709.0, 'completions/mean_length': 3880.90625, 'completions/min_length': 310.0, 'completions/max_length': 12898.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3880.90625, 'completions/min_terminated_length': 310.0, 'completions/max_terminated_length': 12898.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.30721205472946167, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01651676371693611, 'sampling/sampling_logp_difference/max': 1.478200912475586, 'sampling/importance_sampling_ratio/min': 0.23560135066509247, 'sampling/importance_sampling_ratio/mean': 0.9999339580535889, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4547530673444271, 'clip_ratio/low_mean': 0.00015485709081985988, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.57152956794016e-05, 'clip_ratio/high_max': 0.0002456310794514138, 'clip_ratio/region_mean': 0.00023057238649926148, 'epoch': 0.01}
+
+  2%|▏         | 16/1024 [1:08:38<65:22:57, 233.51s/it][AINFO 12-02 02:50:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:50:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:50:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:50:44 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 17/1024 [1:13:49<71:52:36, 256.96s/it][A
+                                                       [A{'loss': 0.0732, 'grad_norm': 0.001054947730153799, 'learning_rate': 1e-05, 'num_tokens': 6160101.0, 'completions/mean_length': 5388.375, 'completions/min_length': 1090.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 4250.896484375, 'completions/min_terminated_length': 1090.0, 'completions/max_terminated_length': 15352.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.23144522309303284, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.015085598453879356, 'sampling/sampling_logp_difference/max': 2.6271817684173584, 'sampling/importance_sampling_ratio/min': 0.07228188216686249, 'sampling/importance_sampling_ratio/mean': 0.9999774098396301, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3895924501121044, 'clip_ratio/low_mean': 9.646763464843389e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.7765974613867e-05, 'clip_ratio/high_max': 0.00012618938671948854, 'clip_ratio/region_mean': 0.00014423360971704824, 'epoch': 0.01}
+
+  2%|▏         | 17/1024 [1:13:49<71:52:36, 256.96s/it][AINFO 12-02 02:55:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:55:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:55:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:55:55 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 18/1024 [1:16:54<65:42:10, 235.12s/it][A
+                                                       [A{'loss': 0.1694, 'grad_norm': 0.0028892988339066505, 'learning_rate': 1e-05, 'num_tokens': 6335482.0, 'completions/mean_length': 2596.828125, 'completions/min_length': 622.0, 'completions/max_length': 12920.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 2596.828125, 'completions/min_terminated_length': 622.0, 'completions/max_terminated_length': 12920.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.30250388383865356, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.011103827506303787, 'sampling/sampling_logp_difference/max': 0.7069098949432373, 'sampling/importance_sampling_ratio/min': 0.4931657910346985, 'sampling/importance_sampling_ratio/mean': 0.9999057054519653, 'sampling/importance_sampling_ratio/max': 1.8092303276062012, 'entropy': 0.34704236313700676, 'clip_ratio/low_mean': 0.00013971797488920856, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.663693360067555e-05, 'clip_ratio/high_max': 0.00013487749129126314, 'clip_ratio/region_mean': 0.00017635491076362086, 'epoch': 0.01}
+
+  2%|▏         | 18/1024 [1:16:54<65:42:10, 235.12s/it][AINFO 12-02 02:58:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:58:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:58:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:58:59 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 19/1024 [1:21:13<67:39:51, 242.38s/it][A
+                                                       [A{'loss': 0.0222, 'grad_norm': 0.001230799243785441, 'learning_rate': 1e-05, 'num_tokens': 6616937.0, 'completions/mean_length': 4263.609375, 'completions/min_length': 477.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4071.222412109375, 'completions/min_terminated_length': 477.0, 'completions/max_terminated_length': 15174.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.37981897592544556, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014364926144480705, 'sampling/sampling_logp_difference/max': 3.046107769012451, 'sampling/importance_sampling_ratio/min': 0.04754361882805824, 'sampling/importance_sampling_ratio/mean': 0.9999661445617676, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4048133157193661, 'clip_ratio/low_mean': 0.000164869009040558, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.893295661328011e-05, 'clip_ratio/high_max': 0.00013291119648783933, 'clip_ratio/region_mean': 0.00020380196565383812, 'epoch': 0.01}
+
+  2%|▏         | 19/1024 [1:21:13<67:39:51, 242.38s/it][AINFO 12-02 03:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:03:19 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 20/1024 [1:26:06<71:51:57, 257.69s/it][A
+                                                       [A{'loss': 0.2325, 'grad_norm': 0.003103485330939293, 'learning_rate': 1e-05, 'num_tokens': 6879327.0, 'completions/mean_length': 3965.96875, 'completions/min_length': 724.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 3138.10009765625, 'completions/min_terminated_length': 724.0, 'completions/max_terminated_length': 16065.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.4739636480808258, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.011037546209990978, 'sampling/sampling_logp_difference/max': 1.8783092498779297, 'sampling/importance_sampling_ratio/min': 0.3267216086387634, 'sampling/importance_sampling_ratio/mean': 1.0000557899475098, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.28904543817043304, 'clip_ratio/low_mean': 0.0002175246900151251, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5454003409540746e-05, 'clip_ratio/high_max': 0.00014181601363816299, 'clip_ratio/region_mean': 0.00025297869251517113, 'epoch': 0.01}
+
+  2%|▏         | 20/1024 [1:26:06<71:51:57, 257.69s/it][AINFO 12-02 03:08:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:12 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 21/1024 [1:30:54<74:20:01, 266.80s/it][A
+                                                       [A{'loss': 0.0577, 'grad_norm': 0.003993268124759197, 'learning_rate': 1e-05, 'num_tokens': 7258977.0, 'completions/mean_length': 5802.03125, 'completions/min_length': 173.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5281.6064453125, 'completions/min_terminated_length': 173.0, 'completions/max_terminated_length': 15689.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017135675996541977, 'sampling/sampling_logp_difference/max': 2.0187530517578125, 'sampling/importance_sampling_ratio/min': 0.1328209936618805, 'sampling/importance_sampling_ratio/mean': 1.0000054836273193, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4375212825834751, 'clip_ratio/low_mean': 0.00025796706904657185, 'clip_ratio/low_min': 1.8948006982100196e-05, 'clip_ratio/high_mean': 2.9853191790607525e-05, 'clip_ratio/high_max': 9.734355808177497e-05, 'clip_ratio/region_mean': 0.00028782026674889494, 'epoch': 0.01}
+
+  2%|▏         | 21/1024 [1:30:54<74:20:01, 266.80s/it][AINFO 12-02 03:13:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:13:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:13:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:13:00 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 22/1024 [1:33:55<67:05:50, 241.07s/it][A
+                                                       [A{'loss': 0.0079, 'grad_norm': 0.0019245331641286612, 'learning_rate': 1e-05, 'num_tokens': 7518133.0, 'completions/mean_length': 3869.4375, 'completions/min_length': 518.0, 'completions/max_length': 11434.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3869.4375, 'completions/min_terminated_length': 518.0, 'completions/max_terminated_length': 11434.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.36507582664489746, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016358572989702225, 'sampling/sampling_logp_difference/max': 1.0638248920440674, 'sampling/importance_sampling_ratio/min': 0.3451331853866577, 'sampling/importance_sampling_ratio/mean': 0.9999982714653015, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44892530515789986, 'clip_ratio/low_mean': 0.0003260385524299636, 'clip_ratio/low_min': 0.00011615340554271825, 'clip_ratio/high_mean': 8.728621696718619e-05, 'clip_ratio/high_max': 0.0002894492481573252, 'clip_ratio/region_mean': 0.0004133247721256339, 'epoch': 0.01}
+
+  2%|▏         | 22/1024 [1:33:55<67:05:50, 241.07s/it][AINFO 12-02 03:16:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:16:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:16:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:16:01 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 23/1024 [1:37:50<66:28:41, 239.08s/it][A
+                                                       [A{'loss': -0.0386, 'grad_norm': 0.0019039156613871455, 'learning_rate': 1e-05, 'num_tokens': 7757244.0, 'completions/mean_length': 3547.859375, 'completions/min_length': 335.0, 'completions/max_length': 14289.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3547.859375, 'completions/min_terminated_length': 335.0, 'completions/max_terminated_length': 14289.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.2824692726135254, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.014702252112329006, 'sampling/sampling_logp_difference/max': 0.9164333343505859, 'sampling/importance_sampling_ratio/min': 0.4224785566329956, 'sampling/importance_sampling_ratio/mean': 0.9998500943183899, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3982946574687958, 'clip_ratio/low_mean': 0.00014070415636524558, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.31793488234689e-05, 'clip_ratio/high_max': 0.0002496292036084924, 'clip_ratio/region_mean': 0.0002338834947295254, 'epoch': 0.01}
+
+  2%|▏         | 23/1024 [1:37:50<66:28:41, 239.08s/it][AINFO 12-02 03:19:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:56 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 24/1024 [1:42:06<67:51:10, 244.27s/it][A
+                                                       [A{'loss': 0.1435, 'grad_norm': 0.0017547798343002796, 'learning_rate': 1e-05, 'num_tokens': 8062535.0, 'completions/mean_length': 4636.046875, 'completions/min_length': 1014.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4257.08056640625, 'completions/min_terminated_length': 1014.0, 'completions/max_terminated_length': 12487.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.35612428188323975, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.015055421739816666, 'sampling/sampling_logp_difference/max': 1.8298161029815674, 'sampling/importance_sampling_ratio/min': 0.16044306755065918, 'sampling/importance_sampling_ratio/mean': 0.999926745891571, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3761509396135807, 'clip_ratio/low_mean': 0.00022968862685956992, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.235486509671318e-05, 'clip_ratio/high_max': 0.0002391471316514071, 'clip_ratio/region_mean': 0.00030204348877305165, 'epoch': 0.01}
+
+  2%|▏         | 24/1024 [1:42:06<67:51:10, 244.27s/it][AINFO 12-02 03:24:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:24:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:24:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:24:12 [block_pool.py:292] Successfully reset prefix cache
+
+  2%|▏         | 25/1024 [1:47:11<72:48:43, 262.39s/it][A
+                                                       [A{'loss': 0.1151, 'grad_norm': 0.001728499075397849, 'learning_rate': 1e-05, 'num_tokens': 8566945.0, 'completions/mean_length': 7714.03125, 'completions/min_length': 1092.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 6475.46435546875, 'completions/min_terminated_length': 1092.0, 'completions/max_terminated_length': 14819.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42695629596710205, 'reward': 0.234375, 'reward_std': 0.2472364753484726, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020575616508722305, 'sampling/sampling_logp_difference/max': 5.522520065307617, 'sampling/importance_sampling_ratio/min': 0.003995765931904316, 'sampling/importance_sampling_ratio/mean': 1.000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5347900986671448, 'clip_ratio/low_mean': 0.0002447983779347851, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.221754079480888e-05, 'clip_ratio/high_max': 0.00012219211930641904, 'clip_ratio/region_mean': 0.00028701591509161517, 'epoch': 0.01}
+
+  2%|▏         | 25/1024 [1:47:11<72:48:43, 262.39s/it][AINFO 12-02 03:29:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:29:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:29:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:29:17 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 26/1024 [1:51:48<73:56:27, 266.72s/it][A
+                                                       [A{'loss': 0.0642, 'grad_norm': 0.0006995322764851153, 'learning_rate': 1e-05, 'num_tokens': 9003354.0, 'completions/mean_length': 6643.015625, 'completions/min_length': 838.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 5635.32763671875, 'completions/min_terminated_length': 838.0, 'completions/max_terminated_length': 15118.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.308285653591156, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019060179591178894, 'sampling/sampling_logp_difference/max': 2.4787216186523438, 'sampling/importance_sampling_ratio/min': 0.08385035395622253, 'sampling/importance_sampling_ratio/mean': 1.0001736879348755, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4595770016312599, 'clip_ratio/low_mean': 0.0002470407755481574, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.235011275952274e-05, 'clip_ratio/high_max': 0.0002444806277708267, 'clip_ratio/region_mean': 0.00031939088376020663, 'epoch': 0.01}
+
+  3%|▎         | 26/1024 [1:51:48<73:56:27, 266.72s/it][AINFO 12-02 03:33:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:33:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:33:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:33:53 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 27/1024 [1:56:35<75:32:10, 272.75s/it][A
+                                                       [A{'loss': 0.0113, 'grad_norm': 0.002966905478388071, 'learning_rate': 1e-05, 'num_tokens': 9324623.0, 'completions/mean_length': 4861.203125, 'completions/min_length': 397.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4489.5, 'completions/min_terminated_length': 397.0, 'completions/max_terminated_length': 14157.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.46875, 'reward_std': 0.4092699885368347, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017620380967855453, 'sampling/sampling_logp_difference/max': 1.4601645469665527, 'sampling/importance_sampling_ratio/min': 0.2321980744600296, 'sampling/importance_sampling_ratio/mean': 0.9999405145645142, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.47994261234998703, 'clip_ratio/low_mean': 0.00021479013139469316, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.289386768301483e-05, 'clip_ratio/high_max': 9.920260345097631e-05, 'clip_ratio/region_mean': 0.0002476839963492239, 'epoch': 0.01}
+
+  3%|▎         | 27/1024 [1:56:35<75:32:10, 272.75s/it][AINFO 12-02 03:38:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:38:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:38:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:38:40 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 28/1024 [2:01:24<76:48:46, 277.64s/it][A
+                                                       [A{'loss': 0.0969, 'grad_norm': 0.0020928422454744577, 'learning_rate': 1e-05, 'num_tokens': 9726777.0, 'completions/mean_length': 6133.78125, 'completions/min_length': 441.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 4874.982421875, 'completions/min_terminated_length': 441.0, 'completions/max_terminated_length': 15778.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.4297792911529541, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016771702095866203, 'sampling/sampling_logp_difference/max': 30.104032516479492, 'sampling/importance_sampling_ratio/min': 8.43305189134752e-14, 'sampling/importance_sampling_ratio/mean': 0.9999968409538269, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44855018705129623, 'clip_ratio/low_mean': 0.00025693040788610233, 'clip_ratio/low_min': 2.301390122738667e-05, 'clip_ratio/high_mean': 8.139909368765075e-05, 'clip_ratio/high_max': 0.00023477334616472945, 'clip_ratio/region_mean': 0.00033832949884526897, 'epoch': 0.01}
+
+  3%|▎         | 28/1024 [2:01:24<76:48:46, 277.64s/it][AINFO 12-02 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 29/1024 [2:05:52<75:59:22, 274.94s/it][A
+                                                       [A{'loss': 0.0082, 'grad_norm': 0.0034881241153925657, 'learning_rate': 1e-05, 'num_tokens': 9980523.0, 'completions/mean_length': 3806.40625, 'completions/min_length': 752.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3606.76220703125, 'completions/min_terminated_length': 752.0, 'completions/max_terminated_length': 15735.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.31983357667922974, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01603037491440773, 'sampling/sampling_logp_difference/max': 1.6224892139434814, 'sampling/importance_sampling_ratio/min': 0.19740669429302216, 'sampling/importance_sampling_ratio/mean': 1.000074863433838, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43768391758203506, 'clip_ratio/low_mean': 0.00018532260037318338, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.005594908245257e-05, 'clip_ratio/high_max': 0.00012022379632981028, 'clip_ratio/region_mean': 0.0002153785517293727, 'epoch': 0.01}
+
+  3%|▎         | 29/1024 [2:05:52<75:59:22, 274.94s/it][AINFO 12-02 03:47:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:47:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:47:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:47:58 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 30/1024 [2:10:39<76:54:01, 278.51s/it][A
+                                                       [A{'loss': 0.0514, 'grad_norm': 0.0024828913155943155, 'learning_rate': 1e-05, 'num_tokens': 10366147.0, 'completions/mean_length': 5863.5, 'completions/min_length': 193.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 4775.17236328125, 'completions/min_terminated_length': 193.0, 'completions/max_terminated_length': 15412.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.36507585644721985, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018751230090856552, 'sampling/sampling_logp_difference/max': 2.8303022384643555, 'sampling/importance_sampling_ratio/min': 0.058995019644498825, 'sampling/importance_sampling_ratio/mean': 0.9999738931655884, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5012530460953712, 'clip_ratio/low_mean': 0.0001690702192718163, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.684838197339559e-05, 'clip_ratio/high_max': 0.0001607731164767756, 'clip_ratio/region_mean': 0.00022591860442844336, 'epoch': 0.01}
+
+  3%|▎         | 30/1024 [2:10:39<76:54:01, 278.51s/it][AINFO 12-02 03:52:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:52:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:52:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:52:45 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 31/1024 [2:15:28<77:40:34, 281.61s/it][A
+                                                       [A{'loss': -0.0264, 'grad_norm': 0.002344103530049324, 'learning_rate': 1e-05, 'num_tokens': 10743911.0, 'completions/mean_length': 5764.0625, 'completions/min_length': 636.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5241.77001953125, 'completions/min_terminated_length': 636.0, 'completions/max_terminated_length': 13967.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.390625, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01674613729119301, 'sampling/sampling_logp_difference/max': 6.569549560546875, 'sampling/importance_sampling_ratio/min': 0.001402428955771029, 'sampling/importance_sampling_ratio/mean': 0.9999400973320007, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4986870177090168, 'clip_ratio/low_mean': 0.00013469393161358312, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1034340988990152e-05, 'clip_ratio/high_max': 7.796757199685089e-05, 'clip_ratio/region_mean': 0.00015572827214782592, 'epoch': 0.01}
+
+  3%|▎         | 31/1024 [2:15:28<77:40:34, 281.61s/it][AINFO 12-02 03:57:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:57:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:57:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:57:34 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 32/1024 [2:19:56<76:27:49, 277.49s/it][A
+                                                       [A{'loss': 0.0255, 'grad_norm': 0.0024679740890860558, 'learning_rate': 1e-05, 'num_tokens': 11097210.0, 'completions/mean_length': 5378.046875, 'completions/min_length': 280.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4836.77001953125, 'completions/min_terminated_length': 280.0, 'completions/max_terminated_length': 14608.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.434487521648407, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014684667810797691, 'sampling/sampling_logp_difference/max': 1.6007983684539795, 'sampling/importance_sampling_ratio/min': 0.20173537731170654, 'sampling/importance_sampling_ratio/mean': 0.9999722242355347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.36454083770513535, 'clip_ratio/low_mean': 0.00022259017532633152, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.262989265655051e-05, 'clip_ratio/high_max': 0.0001950213127202005, 'clip_ratio/region_mean': 0.00029522006025217706, 'epoch': 0.01}
+
+  3%|▎         | 32/1024 [2:19:56<76:27:49, 277.49s/it][AINFO 12-02 04:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:02:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:02:02 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 33/1024 [2:24:25<75:42:38, 275.03s/it][A
+                                                       [A{'loss': -0.0006, 'grad_norm': 0.002607004251331091, 'learning_rate': 1e-05, 'num_tokens': 11430039.0, 'completions/mean_length': 5045.703125, 'completions/min_length': 518.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4865.73046875, 'completions/min_terminated_length': 518.0, 'completions/max_terminated_length': 15090.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.34717273712158203, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01814151369035244, 'sampling/sampling_logp_difference/max': 9.249975204467773, 'sampling/importance_sampling_ratio/min': 9.61140394792892e-05, 'sampling/importance_sampling_ratio/mean': 0.9999929666519165, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5083163343369961, 'clip_ratio/low_mean': 0.0001268563953544799, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.592065233235189e-05, 'clip_ratio/high_max': 0.0002190020522903069, 'clip_ratio/region_mean': 0.00019277704541309504, 'epoch': 0.02}
+
+  3%|▎         | 33/1024 [2:24:25<75:42:38, 275.03s/it][AINFO 12-02 04:06:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:31 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 34/1024 [2:29:08<76:15:19, 277.29s/it][A
+                                                       [A{'loss': -0.0385, 'grad_norm': 0.0017760021146386862, 'learning_rate': 1e-05, 'num_tokens': 11765256.0, 'completions/mean_length': 5051.140625, 'completions/min_length': 812.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4493.78662109375, 'completions/min_terminated_length': 812.0, 'completions/max_terminated_length': 14872.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.31983357667922974, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01773376762866974, 'sampling/sampling_logp_difference/max': 3.987311363220215, 'sampling/importance_sampling_ratio/min': 0.018549520522356033, 'sampling/importance_sampling_ratio/mean': 1.000049352645874, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5182696953415871, 'clip_ratio/low_mean': 0.00018071049362333724, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.596040323827765e-05, 'clip_ratio/high_max': 0.00011372525295882951, 'clip_ratio/region_mean': 0.00021667089822585694, 'epoch': 0.02}
+
+  3%|▎         | 34/1024 [2:29:08<76:15:19, 277.29s/it][AINFO 12-02 04:11:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:11:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:11:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:11:13 [block_pool.py:292] Successfully reset prefix cache
+
+  3%|▎         | 35/1024 [2:33:38<75:34:45, 275.11s/it][A
+                                                       [A{'loss': 0.1304, 'grad_norm': 0.005153927020728588, 'learning_rate': 1e-05, 'num_tokens': 12100470.0, 'completions/mean_length': 5061.34375, 'completions/min_length': 971.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 4306.5, 'completions/min_terminated_length': 971.0, 'completions/max_terminated_length': 16123.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.36507582664489746, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014922235161066055, 'sampling/sampling_logp_difference/max': 2.1693801879882812, 'sampling/importance_sampling_ratio/min': 0.11424840986728668, 'sampling/importance_sampling_ratio/mean': 0.9999926090240479, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3710443638265133, 'clip_ratio/low_mean': 0.0002405905520390661, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.9990677224705e-05, 'clip_ratio/high_max': 0.0002264582162752049, 'clip_ratio/region_mean': 0.00033058122244256083, 'epoch': 0.02}
+
+  3%|▎         | 35/1024 [2:33:38<75:34:45, 275.11s/it][AINFO 12-02 04:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:15:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:15:43 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 36/1024 [2:38:16<75:46:11, 276.08s/it][A
+                                                       [A{'loss': 0.0801, 'grad_norm': 0.0033912325743585825, 'learning_rate': 1e-05, 'num_tokens': 12448829.0, 'completions/mean_length': 5308.484375, 'completions/min_length': 696.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4951.20947265625, 'completions/min_terminated_length': 696.0, 'completions/max_terminated_length': 15851.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.28247910737991333, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.014912934973835945, 'sampling/sampling_logp_difference/max': 1.439845323562622, 'sampling/importance_sampling_ratio/min': 0.2369644045829773, 'sampling/importance_sampling_ratio/mean': 0.9999480247497559, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.41571106389164925, 'clip_ratio/low_mean': 7.256042272274499e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.681889870676969e-05, 'clip_ratio/high_max': 0.000130166367853235, 'clip_ratio/region_mean': 0.00010937932052001997, 'epoch': 0.02}
+
+  4%|▎         | 36/1024 [2:38:16<75:46:11, 276.08s/it][AINFO 12-02 04:20:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:20:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:20:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:20:22 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 37/1024 [2:43:02<76:29:58, 279.03s/it][A
+                                                       [A{'loss': 0.0733, 'grad_norm': 0.0010683785658329725, 'learning_rate': 1e-05, 'num_tokens': 12826012.0, 'completions/mean_length': 5739.984375, 'completions/min_length': 1242.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5396.62890625, 'completions/min_terminated_length': 1242.0, 'completions/max_terminated_length': 15895.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.3266732692718506, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014569239690899849, 'sampling/sampling_logp_difference/max': 1.4225099086761475, 'sampling/importance_sampling_ratio/min': 0.24110810458660126, 'sampling/importance_sampling_ratio/mean': 1.0000569820404053, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3601691238582134, 'clip_ratio/low_mean': 0.0003499828098938451, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.714960223533126e-05, 'clip_ratio/high_max': 0.0001173562181975285, 'clip_ratio/region_mean': 0.0003871324133797316, 'epoch': 0.02}
+
+  4%|▎         | 37/1024 [2:43:02<76:29:58, 279.03s/it][AINFO 12-02 04:25:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:25:08 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▎         | 38/1024 [2:48:11<78:52:03, 287.96s/it][A
+                                                       [A{'loss': 0.0423, 'grad_norm': 0.005096756387501955, 'learning_rate': 1e-05, 'num_tokens': 13184502.0, 'completions/mean_length': 5439.28125, 'completions/min_length': 386.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4511.7626953125, 'completions/min_terminated_length': 386.0, 'completions/max_terminated_length': 15818.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.46875, 'reward_std': 0.25513994693756104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.014763720333576202, 'sampling/sampling_logp_difference/max': 1.8347028493881226, 'sampling/importance_sampling_ratio/min': 0.1596609354019165, 'sampling/importance_sampling_ratio/mean': 1.0000321865081787, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4018342159688473, 'clip_ratio/low_mean': 4.738200595966191e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1322271436474693e-05, 'clip_ratio/high_max': 6.916213669683202e-05, 'clip_ratio/region_mean': 6.870427785088395e-05, 'epoch': 0.02}
+
+  4%|▎         | 38/1024 [2:48:11<78:52:03, 287.96s/it][AINFO 12-02 04:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:16 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 39/1024 [2:52:12<74:56:21, 273.89s/it][A
+                                                       [A{'loss': 0.0312, 'grad_norm': 0.003977675922214985, 'learning_rate': 1e-05, 'num_tokens': 13406943.0, 'completions/mean_length': 3341.890625, 'completions/min_length': 851.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3134.873291015625, 'completions/min_terminated_length': 851.0, 'completions/max_terminated_length': 14532.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.3934885859489441, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01334306225180626, 'sampling/sampling_logp_difference/max': 1.2154099941253662, 'sampling/importance_sampling_ratio/min': 0.29658839106559753, 'sampling/importance_sampling_ratio/mean': 1.0000569820404053, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3964227959513664, 'clip_ratio/low_mean': 0.00012399935849316535, 'clip_ratio/low_min': 1.1662623364827596e-05, 'clip_ratio/high_mean': 8.814334091766796e-05, 'clip_ratio/high_max': 0.00026012490434368374, 'clip_ratio/region_mean': 0.00021214270645941724, 'epoch': 0.02}
+
+  4%|▍         | 39/1024 [2:52:12<74:56:21, 273.89s/it][AINFO 12-02 04:34:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:34:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:34:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:34:18 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 40/1024 [2:56:54<75:34:17, 276.48s/it][A
+                                                       [A{'loss': -0.0185, 'grad_norm': 0.002520601497963071, 'learning_rate': 1e-05, 'num_tokens': 13720364.0, 'completions/mean_length': 4764.203125, 'completions/min_length': 397.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 3989.55029296875, 'completions/min_terminated_length': 397.0, 'completions/max_terminated_length': 14144.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015582827851176262, 'sampling/sampling_logp_difference/max': 2.0259647369384766, 'sampling/importance_sampling_ratio/min': 0.13186657428741455, 'sampling/importance_sampling_ratio/mean': 1.000061273574829, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4359661601483822, 'clip_ratio/low_mean': 0.0001972071040654555, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.247393442507018e-05, 'clip_ratio/high_max': 9.677437810751144e-05, 'clip_ratio/region_mean': 0.00022968103621678893, 'epoch': 0.02}
+
+  4%|▍         | 40/1024 [2:56:54<75:34:17, 276.48s/it][AINFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 41/1024 [3:01:30<75:25:41, 276.24s/it][A
+                                                       [A{'loss': -0.0039, 'grad_norm': 0.0011015260824933648, 'learning_rate': 1e-05, 'num_tokens': 14048015.0, 'completions/mean_length': 4951.296875, 'completions/min_length': 249.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4582.5, 'completions/min_terminated_length': 249.0, 'completions/max_terminated_length': 15833.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.46875, 'reward_std': 0.3119301199913025, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017535807564854622, 'sampling/sampling_logp_difference/max': 5.550166130065918, 'sampling/importance_sampling_ratio/min': 0.0038868116680532694, 'sampling/importance_sampling_ratio/mean': 0.9999201893806458, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5068747252225876, 'clip_ratio/low_mean': 0.0002064325885839935, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5835109681793256e-05, 'clip_ratio/high_max': 0.00010334043872717302, 'clip_ratio/region_mean': 0.00023226769735629205, 'epoch': 0.02}
+
+  4%|▍         | 41/1024 [3:01:30<75:25:41, 276.24s/it][AINFO 12-02 04:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:43:36 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 42/1024 [3:06:24<76:47:39, 281.53s/it][A
+                                                       [A{'loss': 0.0689, 'grad_norm': 0.0017638427671045065, 'learning_rate': 1e-05, 'num_tokens': 14423831.0, 'completions/mean_length': 5731.5, 'completions/min_length': 756.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 4629.51708984375, 'completions/min_terminated_length': 756.0, 'completions/max_terminated_length': 14559.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.421875, 'reward_std': 0.32878512144088745, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01607200875878334, 'sampling/sampling_logp_difference/max': 2.3187572956085205, 'sampling/importance_sampling_ratio/min': 0.09839578717947006, 'sampling/importance_sampling_ratio/mean': 1.0000231266021729, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4076359234750271, 'clip_ratio/low_mean': 0.0003398085727894795, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0027742519014282e-05, 'clip_ratio/high_max': 4.011097007605713e-05, 'clip_ratio/region_mean': 0.00034983631303475704, 'epoch': 0.02}
+
+  4%|▍         | 42/1024 [3:06:24<76:47:39, 281.53s/it][AINFO 12-02 04:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:48:30 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 43/1024 [3:10:56<75:55:11, 278.60s/it][A
+                                                       [A{'loss': 0.0029, 'grad_norm': 0.003941641189157963, 'learning_rate': 1e-05, 'num_tokens': 14795680.0, 'completions/mean_length': 5643.390625, 'completions/min_length': 746.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5296.91943359375, 'completions/min_terminated_length': 746.0, 'completions/max_terminated_length': 14189.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.3571978807449341, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01851939968764782, 'sampling/sampling_logp_difference/max': 2.8491101264953613, 'sampling/importance_sampling_ratio/min': 0.05789581686258316, 'sampling/importance_sampling_ratio/mean': 0.999857485294342, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.507311012595892, 'clip_ratio/low_mean': 0.0002511427301215008, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.305796940367145e-05, 'clip_ratio/high_max': 0.00012139247428422095, 'clip_ratio/region_mean': 0.0002842006979335565, 'epoch': 0.02}
+
+  4%|▍         | 43/1024 [3:10:56<75:55:11, 278.60s/it][AINFO 12-02 04:53:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:01 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 44/1024 [3:15:49<77:01:49, 282.97s/it][A
+                                                       [A{'loss': -0.0114, 'grad_norm': 0.0008316304883919656, 'learning_rate': 1e-05, 'num_tokens': 15117183.0, 'completions/mean_length': 4869.234375, 'completions/min_length': 587.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4497.7900390625, 'completions/min_terminated_length': 587.0, 'completions/max_terminated_length': 15619.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42695629596710205, 'reward': 0.234375, 'reward_std': 0.23144522309303284, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01583625189960003, 'sampling/sampling_logp_difference/max': 1.6649298667907715, 'sampling/importance_sampling_ratio/min': 0.1892039179801941, 'sampling/importance_sampling_ratio/mean': 0.9999276995658875, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.449595358222723, 'clip_ratio/low_mean': 0.00015151548791436653, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.388216796589404e-05, 'clip_ratio/high_max': 8.421338270636625e-05, 'clip_ratio/region_mean': 0.00017539765440233168, 'epoch': 0.02}
+
+  4%|▍         | 44/1024 [3:15:49<77:01:49, 282.97s/it][AINFO 12-02 04:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:57:55 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 45/1024 [3:20:41<77:42:20, 285.74s/it][A
+                                                       [A{'loss': 0.1009, 'grad_norm': 0.0012375094229355454, 'learning_rate': 1e-05, 'num_tokens': 15618338.0, 'completions/mean_length': 7673.171875, 'completions/min_length': 356.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6934.96630859375, 'completions/min_terminated_length': 356.0, 'completions/max_terminated_length': 16339.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.3913668990135193, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020412705838680267, 'sampling/sampling_logp_difference/max': 3.6121277809143066, 'sampling/importance_sampling_ratio/min': 0.02699434943497181, 'sampling/importance_sampling_ratio/mean': 1.0000618696212769, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4915333352982998, 'clip_ratio/low_mean': 0.0002517688008083496, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.217939628390013e-05, 'clip_ratio/high_max': 0.00021813440071127843, 'clip_ratio/region_mean': 0.0003239481902710395, 'epoch': 0.02}
+
+  4%|▍         | 45/1024 [3:20:41<77:42:20, 285.74s/it][AINFO 12-02 05:02:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:02:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:02:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:02:47 [block_pool.py:292] Successfully reset prefix cache
+
+  4%|▍         | 46/1024 [3:25:52<79:40:29, 293.28s/it][A
+                                                       [A{'loss': 0.0838, 'grad_norm': 0.002584220375865698, 'learning_rate': 1e-05, 'num_tokens': 16080275.0, 'completions/mean_length': 7036.390625, 'completions/min_length': 621.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 5888.4384765625, 'completions/min_terminated_length': 621.0, 'completions/max_terminated_length': 16316.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.39560043811798096, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017400287091732025, 'sampling/sampling_logp_difference/max': 2.9772582054138184, 'sampling/importance_sampling_ratio/min': 0.05093228816986084, 'sampling/importance_sampling_ratio/mean': 1.0000799894332886, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42682281136512756, 'clip_ratio/low_mean': 0.00021741277896580868, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.249252012617944e-05, 'clip_ratio/high_max': 0.00013961499735160032, 'clip_ratio/region_mean': 0.0002699053002288565, 'epoch': 0.02}
+
+  4%|▍         | 46/1024 [3:25:52<79:40:29, 293.28s/it][AINFO 12-02 05:07:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:07:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:07:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:07:58 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 47/1024 [3:30:25<77:57:58, 287.29s/it][A
+                                                       [A{'loss': 0.0372, 'grad_norm': 0.002873660996556282, 'learning_rate': 1e-05, 'num_tokens': 16397319.0, 'completions/mean_length': 4788.6875, 'completions/min_length': 677.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4414.64501953125, 'completions/min_terminated_length': 677.0, 'completions/max_terminated_length': 14499.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4732423722743988, 'reward': 0.328125, 'reward_std': 0.36978405714035034, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014545915648341179, 'sampling/sampling_logp_difference/max': 1.5124142169952393, 'sampling/importance_sampling_ratio/min': 0.36420938372612, 'sampling/importance_sampling_ratio/mean': 1.0000892877578735, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.45166004449129105, 'clip_ratio/low_mean': 0.00021250549070828129, 'clip_ratio/low_min': 2.2743814042769372e-05, 'clip_ratio/high_mean': 1.8304958302906016e-05, 'clip_ratio/high_max': 6.443001529987669e-05, 'clip_ratio/region_mean': 0.00023081044673745055, 'epoch': 0.02}
+
+  5%|▍         | 47/1024 [3:30:25<77:57:58, 287.29s/it][AINFO 12-02 05:12:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:12:31 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 48/1024 [3:35:03<77:05:06, 284.33s/it][A
+                                                       [A{'loss': -0.0185, 'grad_norm': 0.0033531233202666044, 'learning_rate': 1e-05, 'num_tokens': 16719876.0, 'completions/mean_length': 4885.828125, 'completions/min_length': 866.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4703.31787109375, 'completions/min_terminated_length': 866.0, 'completions/max_terminated_length': 14815.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.390625, 'reward_std': 0.36507585644721985, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01921793259680271, 'sampling/sampling_logp_difference/max': 2.941105842590332, 'sampling/importance_sampling_ratio/min': 0.05280729755759239, 'sampling/importance_sampling_ratio/mean': 0.9999918341636658, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6048321798443794, 'clip_ratio/low_mean': 9.073583623830928e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.869914386697928e-06, 'clip_ratio/high_max': 2.7479657546791714e-05, 'clip_ratio/region_mean': 9.760575107975455e-05, 'epoch': 0.02}
+
+  5%|▍         | 48/1024 [3:35:03<77:05:06, 284.33s/it][AINFO 12-02 05:17:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:08 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 49/1024 [3:39:07<73:47:19, 272.45s/it][A
+                                                       [A{'loss': -0.0423, 'grad_norm': 0.003584296675398946, 'learning_rate': 1e-05, 'num_tokens': 17026014.0, 'completions/mean_length': 4591.90625, 'completions/min_length': 357.0, 'completions/max_length': 15691.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4591.90625, 'completions/min_terminated_length': 357.0, 'completions/max_terminated_length': 15691.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.3934885859489441, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.013645332306623459, 'sampling/sampling_logp_difference/max': 2.1520755290985107, 'sampling/importance_sampling_ratio/min': 0.11624263972043991, 'sampling/importance_sampling_ratio/mean': 0.9999136924743652, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.34244759380817413, 'clip_ratio/low_mean': 0.00030390772053578985, 'clip_ratio/low_min': 1.3529603165807202e-05, 'clip_ratio/high_mean': 6.680671458525467e-05, 'clip_ratio/high_max': 0.00018299027760804165, 'clip_ratio/region_mean': 0.0003707144387590233, 'epoch': 0.02}
+
+  5%|▍         | 49/1024 [3:39:07<73:47:19, 272.45s/it][AINFO 12-02 05:21:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:21:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:21:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:21:13 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 50/1024 [3:44:15<76:33:21, 282.96s/it][A
+                                                       [A{'loss': -0.0167, 'grad_norm': 0.0014293681597337127, 'learning_rate': 1e-05, 'num_tokens': 17379352.0, 'completions/mean_length': 5344.03125, 'completions/min_length': 306.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.140625, 'completions/mean_terminated_length': 3537.49072265625, 'completions/min_terminated_length': 306.0, 'completions/max_terminated_length': 13592.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.3266732692718506, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018551204353570938, 'sampling/sampling_logp_difference/max': 1.9798692464828491, 'sampling/importance_sampling_ratio/min': 0.13808728754520416, 'sampling/importance_sampling_ratio/mean': 1.0000156164169312, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5176180489361286, 'clip_ratio/low_mean': 0.00024453642708976986, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.379413919901708e-05, 'clip_ratio/high_max': 9.204179787047906e-05, 'clip_ratio/region_mean': 0.00027833056810777634, 'epoch': 0.02}
+
+  5%|▍         | 50/1024 [3:44:15<76:33:21, 282.96s/it][AINFO 12-02 05:26:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:26:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:26:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:26:21 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▍         | 51/1024 [3:47:44<70:29:40, 260.82s/it][A
+                                                       [A{'loss': 0.0143, 'grad_norm': 0.0028012983966618776, 'learning_rate': 1e-05, 'num_tokens': 17678918.0, 'completions/mean_length': 4534.46875, 'completions/min_length': 789.0, 'completions/max_length': 13417.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4534.46875, 'completions/min_terminated_length': 789.0, 'completions/max_terminated_length': 13417.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.4581822156906128, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.01719539426267147, 'sampling/sampling_logp_difference/max': 6.565022945404053, 'sampling/importance_sampling_ratio/min': 0.0014087916351854801, 'sampling/importance_sampling_ratio/mean': 0.9999317526817322, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48525528982281685, 'clip_ratio/low_mean': 0.0002699469782783126, 'clip_ratio/low_min': 5.479251922224648e-05, 'clip_ratio/high_mean': 5.577002548307064e-05, 'clip_ratio/high_max': 0.00013300060527399182, 'clip_ratio/region_mean': 0.00032571700285188854, 'epoch': 0.02}
+
+  5%|▍         | 51/1024 [3:47:44<70:29:40, 260.82s/it][AINFO 12-02 05:29:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:29:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:29:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:29:50 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 52/1024 [3:53:03<75:08:27, 278.30s/it][A
+                                                       [A{'loss': 0.0196, 'grad_norm': 0.003680001711472869, 'learning_rate': 1e-05, 'num_tokens': 18131532.0, 'completions/mean_length': 6930.84375, 'completions/min_length': 682.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6300.6337890625, 'completions/min_terminated_length': 682.0, 'completions/max_terminated_length': 15448.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5039526224136353, 'reward': 0.5, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018073976039886475, 'sampling/sampling_logp_difference/max': 4.784112453460693, 'sampling/importance_sampling_ratio/min': 0.008361541666090488, 'sampling/importance_sampling_ratio/mean': 0.9999715089797974, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4711146056652069, 'clip_ratio/low_mean': 0.00024330906808245345, 'clip_ratio/low_min': 1.236399566550972e-05, 'clip_ratio/high_mean': 4.924563791064429e-05, 'clip_ratio/high_max': 0.0001790639744285727, 'clip_ratio/region_mean': 0.00029255470872158185, 'epoch': 0.02}
+
+  5%|▌         | 52/1024 [3:53:03<75:08:27, 278.30s/it][AINFO 12-02 05:35:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:35:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:35:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:35:09 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 53/1024 [3:57:43<75:12:42, 278.85s/it][A
+                                                       [A{'loss': -0.0042, 'grad_norm': 0.0023457545321434736, 'learning_rate': 1e-05, 'num_tokens': 18544676.0, 'completions/mean_length': 6292.125, 'completions/min_length': 1730.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6131.93701171875, 'completions/min_terminated_length': 1730.0, 'completions/max_terminated_length': 15102.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.49705949425697327, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.017527619376778603, 'sampling/sampling_logp_difference/max': 2.3390486240386963, 'sampling/importance_sampling_ratio/min': 0.0964193269610405, 'sampling/importance_sampling_ratio/mean': 1.0001380443572998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4499824084341526, 'clip_ratio/low_mean': 0.0003625547033152543, 'clip_ratio/low_min': 0.00010805308011185843, 'clip_ratio/high_mean': 6.173703013701015e-05, 'clip_ratio/high_max': 0.00015477807210118044, 'clip_ratio/region_mean': 0.0004242917239025701, 'epoch': 0.02}
+
+  5%|▌         | 53/1024 [3:57:43<75:12:42, 278.85s/it][AINFO 12-02 05:39:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:39:49 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 54/1024 [4:02:42<76:42:30, 284.69s/it][A
+                                                       [A{'loss': 0.0474, 'grad_norm': 0.0026740289758890867, 'learning_rate': 1e-05, 'num_tokens': 18910647.0, 'completions/mean_length': 5584.546875, 'completions/min_length': 180.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4669.3388671875, 'completions/min_terminated_length': 180.0, 'completions/max_terminated_length': 15615.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.4208277463912964, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01411589328199625, 'sampling/sampling_logp_difference/max': 15.370457649230957, 'sampling/importance_sampling_ratio/min': 2.1120055748724553e-07, 'sampling/importance_sampling_ratio/mean': 0.9999420642852783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3640037924051285, 'clip_ratio/low_mean': 0.00020696077945103752, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.082868268706079e-05, 'clip_ratio/high_max': 0.00021294883390510222, 'clip_ratio/region_mean': 0.00029778946372971404, 'epoch': 0.02}
+
+  5%|▌         | 54/1024 [4:02:42<76:42:30, 284.69s/it][AINFO 12-02 05:44:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:44:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:44:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:44:47 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 55/1024 [4:07:08<75:09:36, 279.23s/it][A
+                                                       [A{'loss': 0.1333, 'grad_norm': 0.0020250508096069098, 'learning_rate': 1e-05, 'num_tokens': 19290961.0, 'completions/mean_length': 5788.28125, 'completions/min_length': 920.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5267.18017578125, 'completions/min_terminated_length': 920.0, 'completions/max_terminated_length': 16068.0, 'rewards/accuracy_reward/mean': 0.671875, 'rewards/accuracy_reward/std': 0.4732423722743988, 'reward': 0.671875, 'reward_std': 0.48080334067344666, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.015817370265722275, 'sampling/sampling_logp_difference/max': 1.8585799932479858, 'sampling/importance_sampling_ratio/min': 0.15589384734630585, 'sampling/importance_sampling_ratio/mean': 1.000024676322937, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3667205236852169, 'clip_ratio/low_mean': 0.00019938927835028153, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.979547672221088e-05, 'clip_ratio/high_max': 0.00024353322078241035, 'clip_ratio/region_mean': 0.00027918475461774506, 'epoch': 0.03}
+
+  5%|▌         | 55/1024 [4:07:08<75:09:36, 279.23s/it][AINFO 12-02 05:49:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:49:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:49:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:49:14 [block_pool.py:292] Successfully reset prefix cache
+
+  5%|▌         | 56/1024 [4:12:09<76:48:59, 285.68s/it][A
+                                                       [A{'loss': 0.0789, 'grad_norm': 0.0016630663303658366, 'learning_rate': 1e-05, 'num_tokens': 19670022.0, 'completions/mean_length': 5749.078125, 'completions/min_length': 533.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5040.08349609375, 'completions/min_terminated_length': 533.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.29355230927467346, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01614568941295147, 'sampling/sampling_logp_difference/max': 2.710545539855957, 'sampling/importance_sampling_ratio/min': 0.06650052219629288, 'sampling/importance_sampling_ratio/mean': 0.9999439716339111, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4693554937839508, 'clip_ratio/low_mean': 0.0001710906817606883, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.007581858582853e-05, 'clip_ratio/high_max': 0.00012030327434331411, 'clip_ratio/region_mean': 0.00020116649920964846, 'epoch': 0.03}
+
+  5%|▌         | 56/1024 [4:12:09<76:48:59, 285.68s/it][AINFO 12-02 05:54:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:54:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:54:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:54:14 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 57/1024 [4:16:06<72:49:25, 271.11s/it][A
+                                                       [A{'loss': 0.069, 'grad_norm': 0.0035494917538017035, 'learning_rate': 1e-05, 'num_tokens': 19924898.0, 'completions/mean_length': 3807.9375, 'completions/min_length': 745.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3608.317626953125, 'completions/min_terminated_length': 745.0, 'completions/max_terminated_length': 13615.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.400318443775177, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.013213995844125748, 'sampling/sampling_logp_difference/max': 1.2064995765686035, 'sampling/importance_sampling_ratio/min': 0.29924294352531433, 'sampling/importance_sampling_ratio/mean': 0.9998854398727417, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.35805340856313705, 'clip_ratio/low_mean': 0.00017924150870385347, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.968100812329794e-05, 'clip_ratio/high_max': 0.0002841245750460075, 'clip_ratio/region_mean': 0.00027892251728189876, 'epoch': 0.03}
+
+  6%|▌         | 57/1024 [4:16:06<72:49:25, 271.11s/it][AINFO 12-02 05:58:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:58:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:58:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:58:12 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 58/1024 [4:20:26<71:51:10, 267.78s/it][A
+                                                       [A{'loss': 0.0325, 'grad_norm': 0.0013930411078035831, 'learning_rate': 1e-05, 'num_tokens': 20233543.0, 'completions/mean_length': 4673.953125, 'completions/min_length': 258.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4296.20947265625, 'completions/min_terminated_length': 258.0, 'completions/max_terminated_length': 16244.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.23144522309303284, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017239723354578018, 'sampling/sampling_logp_difference/max': 1.2281160354614258, 'sampling/importance_sampling_ratio/min': 0.292843759059906, 'sampling/importance_sampling_ratio/mean': 0.9999855160713196, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46354416012763977, 'clip_ratio/low_mean': 4.6247843783930875e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.6247843783930875e-05, 'epoch': 0.03}
+
+  6%|▌         | 58/1024 [4:20:26<71:51:10, 267.78s/it][AINFO 12-02 06:02:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:02:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:02:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:02:32 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 59/1024 [4:23:33<65:19:18, 243.69s/it][A
+                                                       [A{'loss': 0.0436, 'grad_norm': 0.0017000267980620265, 'learning_rate': 1e-05, 'num_tokens': 20546222.0, 'completions/mean_length': 4732.734375, 'completions/min_length': 547.0, 'completions/max_length': 12045.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4732.734375, 'completions/min_terminated_length': 547.0, 'completions/max_terminated_length': 12045.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.28460073471069336, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015059003606438637, 'sampling/sampling_logp_difference/max': 1.3706574440002441, 'sampling/importance_sampling_ratio/min': 0.2539399564266205, 'sampling/importance_sampling_ratio/mean': 1.0000501871109009, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.41122225672006607, 'clip_ratio/low_mean': 0.0001343835442639829, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.396984675418935e-05, 'clip_ratio/high_max': 0.00010152040522370953, 'clip_ratio/region_mean': 0.00016835339192766696, 'epoch': 0.03}
+
+  6%|▌         | 59/1024 [4:23:33<65:19:18, 243.69s/it][AINFO 12-02 06:05:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:05:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:05:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:05:39 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 60/1024 [4:26:58<62:08:12, 232.05s/it][A
+                                                       [A{'loss': -0.0284, 'grad_norm': 0.0021763260010629892, 'learning_rate': 1e-05, 'num_tokens': 20746395.0, 'completions/mean_length': 2992.203125, 'completions/min_length': 737.0, 'completions/max_length': 13639.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 2992.203125, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 13639.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.40625, 'reward_std': 0.3208816647529602, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015204581432044506, 'sampling/sampling_logp_difference/max': 1.1182589530944824, 'sampling/importance_sampling_ratio/min': 0.3268483579158783, 'sampling/importance_sampling_ratio/mean': 0.9997838735580444, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4373614974319935, 'clip_ratio/low_mean': 0.00016128654715430457, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.819664561888203e-05, 'clip_ratio/high_max': 0.00015278658247552812, 'clip_ratio/region_mean': 0.0001994831936826813, 'epoch': 0.03}
+
+  6%|▌         | 60/1024 [4:26:58<62:08:12, 232.05s/it][AINFO 12-02 06:09:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:09:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:09:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:09:04 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 61/1024 [4:31:54<67:12:53, 251.27s/it][A
+                                                       [A{'loss': 0.0395, 'grad_norm': 0.001814318704418838, 'learning_rate': 1e-05, 'num_tokens': 21090385.0, 'completions/mean_length': 5202.96875, 'completions/min_length': 697.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 4457.56689453125, 'completions/min_terminated_length': 697.0, 'completions/max_terminated_length': 16316.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.3492845892906189, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01669854298233986, 'sampling/sampling_logp_difference/max': 1.5448095798492432, 'sampling/importance_sampling_ratio/min': 0.2133525013923645, 'sampling/importance_sampling_ratio/mean': 0.9999855160713196, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4734600558876991, 'clip_ratio/low_mean': 0.00015002856798673747, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.256350548028422e-05, 'clip_ratio/high_max': 0.00016216858330153627, 'clip_ratio/region_mean': 0.00021259207551338477, 'epoch': 0.03}
+
+  6%|▌         | 61/1024 [4:31:54<67:12:53, 251.27s/it][AINFO 12-02 06:14:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:14:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:14:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:14:00 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 62/1024 [4:37:33<74:08:39, 277.46s/it][A
+                                                       [A{'loss': 0.0866, 'grad_norm': 0.00457768002524972, 'learning_rate': 1e-05, 'num_tokens': 21634363.0, 'completions/mean_length': 8348.65625, 'completions/min_length': 609.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 7200.75048828125, 'completions/min_terminated_length': 609.0, 'completions/max_terminated_length': 16144.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.421875, 'reward_std': 0.30617380142211914, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018096819519996643, 'sampling/sampling_logp_difference/max': 3.7523508071899414, 'sampling/importance_sampling_ratio/min': 0.023462524637579918, 'sampling/importance_sampling_ratio/mean': 0.9999805688858032, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4502272866666317, 'clip_ratio/low_mean': 0.00027659225452225655, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.408270913496381e-05, 'clip_ratio/high_max': 8.838675921651884e-05, 'clip_ratio/region_mean': 0.00031067496092873625, 'epoch': 0.03}
+
+  6%|▌         | 62/1024 [4:37:33<74:08:39, 277.46s/it][AINFO 12-02 06:19:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:19:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:19:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:19:39 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▌         | 63/1024 [4:42:16<74:32:43, 279.25s/it][A
+                                                       [A{'loss': 0.1246, 'grad_norm': 0.0037536961026489735, 'learning_rate': 1e-05, 'num_tokens': 22036486.0, 'completions/mean_length': 6125.171875, 'completions/min_length': 691.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5962.33349609375, 'completions/min_terminated_length': 691.0, 'completions/max_terminated_length': 15299.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.29826053977012634, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017353136092424393, 'sampling/sampling_logp_difference/max': 2.596637725830078, 'sampling/importance_sampling_ratio/min': 0.18398989737033844, 'sampling/importance_sampling_ratio/mean': 0.9999016523361206, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5074744261801243, 'clip_ratio/low_mean': 0.00016054962543421425, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3295157689062762e-05, 'clip_ratio/high_max': 8.213822820835048e-05, 'clip_ratio/region_mean': 0.00018384478198640863, 'epoch': 0.03}
+
+  6%|▌         | 63/1024 [4:42:16<74:32:43, 279.25s/it][AINFO 12-02 06:24:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:24:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:24:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:24:22 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▋         | 64/1024 [4:47:22<76:32:21, 287.02s/it][A
+                                                       [A{'loss': -0.0187, 'grad_norm': 0.0008877902873791754, 'learning_rate': 1e-05, 'num_tokens': 22406109.0, 'completions/mean_length': 5608.234375, 'completions/min_length': 447.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4695.03369140625, 'completions/min_terminated_length': 447.0, 'completions/max_terminated_length': 15440.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.359375, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015221178531646729, 'sampling/sampling_logp_difference/max': 1.6659399271011353, 'sampling/importance_sampling_ratio/min': 0.18901291489601135, 'sampling/importance_sampling_ratio/mean': 0.9999306201934814, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46297021955251694, 'clip_ratio/low_mean': 7.728620857960777e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.08188305098156e-05, 'clip_ratio/high_max': 0.00011813699711638037, 'clip_ratio/region_mean': 0.00010810503999891807, 'epoch': 0.03}
+
+  6%|▋         | 64/1024 [4:47:22<76:32:21, 287.02s/it][AINFO 12-02 06:29:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▋         | 65/1024 [4:51:32<73:31:10, 275.99s/it][A
+                                                       [A{'loss': 0.011, 'grad_norm': 0.0019912866409868, 'learning_rate': 1e-05, 'num_tokens': 22711155.0, 'completions/mean_length': 4585.09375, 'completions/min_length': 890.0, 'completions/max_length': 16145.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4585.09375, 'completions/min_terminated_length': 890.0, 'completions/max_terminated_length': 16145.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0174114927649498, 'sampling/sampling_logp_difference/max': 1.3542280197143555, 'sampling/importance_sampling_ratio/min': 0.2581464946269989, 'sampling/importance_sampling_ratio/mean': 1.0000087022781372, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.616243090480566, 'clip_ratio/low_mean': 0.00013005092728235468, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4022726620387402e-05, 'clip_ratio/high_max': 7.917385846667457e-05, 'clip_ratio/region_mean': 0.00015407365344799473, 'epoch': 0.03}
+
+  6%|▋         | 65/1024 [4:51:32<73:31:10, 275.99s/it][AINFO 12-02 06:33:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:33:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:33:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:33:37 [block_pool.py:292] Successfully reset prefix cache
+
+  6%|▋         | 66/1024 [4:56:12<73:46:08, 277.21s/it][A
+                                                       [A{'loss': -0.0012, 'grad_norm': 0.0016551906010136008, 'learning_rate': 1e-05, 'num_tokens': 23121266.0, 'completions/mean_length': 6247.359375, 'completions/min_length': 948.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5920.37060546875, 'completions/min_terminated_length': 948.0, 'completions/max_terminated_length': 15595.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.421875, 'reward_std': 0.34246450662612915, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01890268549323082, 'sampling/sampling_logp_difference/max': 14.989118576049805, 'sampling/importance_sampling_ratio/min': 3.092491454026458e-07, 'sampling/importance_sampling_ratio/mean': 0.999890923500061, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4940183646976948, 'clip_ratio/low_mean': 0.0002678729954368464, 'clip_ratio/low_min': 3.0502684239763767e-05, 'clip_ratio/high_mean': 2.5152995249300147e-05, 'clip_ratio/high_max': 8.158514265232952e-05, 'clip_ratio/region_mean': 0.00029302599068614654, 'epoch': 0.03}
+
+  6%|▋         | 66/1024 [4:56:12<73:46:08, 277.21s/it][AINFO 12-02 06:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:38:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:38:18 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 67/1024 [5:00:55<74:11:18, 279.08s/it][A
+                                                       [A{'loss': -0.0064, 'grad_norm': 0.0027090860530734062, 'learning_rate': 1e-05, 'num_tokens': 23429737.0, 'completions/mean_length': 4651.484375, 'completions/min_length': 672.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4465.25439453125, 'completions/min_terminated_length': 672.0, 'completions/max_terminated_length': 14921.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.41186636686325073, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015556230209767818, 'sampling/sampling_logp_difference/max': 21.69475555419922, 'sampling/importance_sampling_ratio/min': 3.7851871637073486e-10, 'sampling/importance_sampling_ratio/mean': 0.9998961687088013, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43849369138479233, 'clip_ratio/low_mean': 0.00016830264212330803, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.345197770471714e-05, 'clip_ratio/high_max': 0.00020198842366880854, 'clip_ratio/region_mean': 0.0002617546280134775, 'epoch': 0.03}
+
+  7%|▋         | 67/1024 [5:00:55<74:11:18, 279.08s/it][AINFO 12-02 06:43:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:43:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:43:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:43:01 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 68/1024 [5:05:15<72:34:03, 273.27s/it][A
+                                                       [A{'loss': -0.0109, 'grad_norm': 0.003926424775272608, 'learning_rate': 1e-05, 'num_tokens': 23711107.0, 'completions/mean_length': 4270.53125, 'completions/min_length': 450.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4078.254150390625, 'completions/min_terminated_length': 450.0, 'completions/max_terminated_length': 15312.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.41186636686325073, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015287546440958977, 'sampling/sampling_logp_difference/max': 4.140946388244629, 'sampling/importance_sampling_ratio/min': 0.015907788649201393, 'sampling/importance_sampling_ratio/mean': 1.0000602006912231, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4087858460843563, 'clip_ratio/low_mean': 8.60070938415447e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.120277832153079e-05, 'clip_ratio/high_max': 0.0002235091933471267, 'clip_ratio/region_mean': 0.0001772098673882283, 'epoch': 0.03}
+
+  7%|▋         | 68/1024 [5:05:15<72:34:03, 273.27s/it][AINFO 12-02 06:47:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:21 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 69/1024 [5:10:10<74:15:34, 279.93s/it][A
+                                                       [A{'loss': 0.0301, 'grad_norm': 0.0014420230872929096, 'learning_rate': 1e-05, 'num_tokens': 24120646.0, 'completions/mean_length': 6248.421875, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5921.4677734375, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 15902.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.40625, 'reward_std': 0.32666343450546265, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019748248159885406, 'sampling/sampling_logp_difference/max': 2.7074122428894043, 'sampling/importance_sampling_ratio/min': 0.06670921295881271, 'sampling/importance_sampling_ratio/mean': 0.9999963045120239, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5241492316126823, 'clip_ratio/low_mean': 0.0003700140823639231, 'clip_ratio/low_min': 2.9134133001207374e-05, 'clip_ratio/high_mean': 5.2489685231194017e-05, 'clip_ratio/high_max': 0.00015725854427728336, 'clip_ratio/region_mean': 0.00042250377373420633, 'epoch': 0.03}
+
+  7%|▋         | 69/1024 [5:10:10<74:15:34, 279.93s/it][AINFO 12-02 06:52:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:52:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:52:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:52:16 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 70/1024 [5:15:20<76:30:56, 288.74s/it][A
+                                                       [A{'loss': 0.0015, 'grad_norm': 0.0019926258828490973, 'learning_rate': 1e-05, 'num_tokens': 24652683.0, 'completions/mean_length': 8165.328125, 'completions/min_length': 828.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.140625, 'completions/mean_terminated_length': 6820.4541015625, 'completions/min_terminated_length': 828.0, 'completions/max_terminated_length': 16258.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.4113916754722595, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018958844244480133, 'sampling/sampling_logp_difference/max': 3.4632158279418945, 'sampling/importance_sampling_ratio/min': 0.03132885321974754, 'sampling/importance_sampling_ratio/mean': 1.000152826309204, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46431687101721764, 'clip_ratio/low_mean': 0.000436262707808055, 'clip_ratio/low_min': 4.860975968767889e-05, 'clip_ratio/high_mean': 6.950050783416373e-05, 'clip_ratio/high_max': 0.00019062381943513174, 'clip_ratio/region_mean': 0.0005057632188254502, 'epoch': 0.03}
+
+  7%|▋         | 70/1024 [5:15:20<76:30:56, 288.74s/it][AINFO 12-02 06:57:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:57:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:57:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:57:25 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 71/1024 [5:20:10<76:31:56, 289.10s/it][A
+                                                       [A{'loss': 0.1188, 'grad_norm': 0.002046707086265087, 'learning_rate': 1e-05, 'num_tokens': 24992151.0, 'completions/mean_length': 5153.5625, 'completions/min_length': 771.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4601.24560546875, 'completions/min_terminated_length': 771.0, 'completions/max_terminated_length': 13119.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.3492845892906189, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016459595412015915, 'sampling/sampling_logp_difference/max': 1.838377833366394, 'sampling/importance_sampling_ratio/min': 0.1590752750635147, 'sampling/importance_sampling_ratio/mean': 1.0000321865081787, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5756671689450741, 'clip_ratio/low_mean': 0.00014396043616216048, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.583491747529479e-05, 'clip_ratio/high_max': 0.0001736442391120363, 'clip_ratio/region_mean': 0.0001997953531827079, 'epoch': 0.03}
+
+  7%|▋         | 71/1024 [5:20:10<76:31:56, 289.10s/it][AINFO 12-02 07:02:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:02:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:02:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:02:15 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 72/1024 [5:25:11<77:24:54, 292.75s/it][A
+                                                       [A{'loss': -0.0159, 'grad_norm': 0.0011077187955379486, 'learning_rate': 1e-05, 'num_tokens': 25367594.0, 'completions/mean_length': 5735.171875, 'completions/min_length': 335.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 4633.56884765625, 'completions/min_terminated_length': 335.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.40625, 'reward_std': 0.30038219690322876, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015588895417749882, 'sampling/sampling_logp_difference/max': 1.869459629058838, 'sampling/importance_sampling_ratio/min': 0.15420696139335632, 'sampling/importance_sampling_ratio/mean': 0.9999856948852539, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4231684133410454, 'clip_ratio/low_mean': 0.0002607224032544764, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8399418422632152e-05, 'clip_ratio/high_max': 7.359767369052861e-05, 'clip_ratio/region_mean': 0.0002791218207676138, 'epoch': 0.03}
+
+  7%|▋         | 72/1024 [5:25:11<77:24:54, 292.75s/it][AINFO 12-02 07:07:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:17 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 73/1024 [5:29:08<72:54:52, 276.02s/it][A
+                                                       [A{'loss': -0.0029, 'grad_norm': 0.0026049406733363867, 'learning_rate': 1e-05, 'num_tokens': 25704684.0, 'completions/mean_length': 5021.65625, 'completions/min_length': 784.0, 'completions/max_length': 14600.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5021.65625, 'completions/min_terminated_length': 784.0, 'completions/max_terminated_length': 14600.0, 'rewards/accuracy_reward/mean': 0.8125, 'rewards/accuracy_reward/std': 0.39339789748191833, 'reward': 0.8125, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.013968106359243393, 'sampling/sampling_logp_difference/max': 5.4766845703125, 'sampling/importance_sampling_ratio/min': 0.004183175507932901, 'sampling/importance_sampling_ratio/mean': 0.9999948740005493, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4137778803706169, 'clip_ratio/low_mean': 0.00011736736541934079, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.716448322345968e-05, 'clip_ratio/high_max': 0.00018995380196429323, 'clip_ratio/region_mean': 0.00017453184818805312, 'epoch': 0.03}
+
+  7%|▋         | 73/1024 [5:29:08<72:54:52, 276.02s/it][AINFO 12-02 07:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:11:14 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 74/1024 [5:34:13<75:06:30, 284.62s/it][A
+                                                       [A{'loss': -0.0002, 'grad_norm': 0.0006604515947401524, 'learning_rate': 1e-05, 'num_tokens': 26169821.0, 'completions/mean_length': 7127.765625, 'completions/min_length': 1149.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 5805.44677734375, 'completions/min_terminated_length': 1149.0, 'completions/max_terminated_length': 15108.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017053252086043358, 'sampling/sampling_logp_difference/max': 3.3366637229919434, 'sampling/importance_sampling_ratio/min': 0.03555538132786751, 'sampling/importance_sampling_ratio/mean': 0.9998608231544495, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46533510461449623, 'clip_ratio/low_mean': 0.00011912480476894416, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.350696483219508e-05, 'clip_ratio/high_max': 0.00014316345004772302, 'clip_ratio/region_mean': 0.00017263177232962335, 'epoch': 0.03}
+
+  7%|▋         | 74/1024 [5:34:13<75:06:30, 284.62s/it][AINFO 12-02 07:16:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:16:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:16:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:16:18 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 75/1024 [5:38:46<74:07:13, 281.17s/it][A
+                                                       [A{'loss': 0.0653, 'grad_norm': 0.0010555664775893092, 'learning_rate': 1e-05, 'num_tokens': 26559947.0, 'completions/mean_length': 5938.71875, 'completions/min_length': 565.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5425.01611328125, 'completions/min_terminated_length': 565.0, 'completions/max_terminated_length': 14190.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.375, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020025379955768585, 'sampling/sampling_logp_difference/max': 1.3287858963012695, 'sampling/importance_sampling_ratio/min': 0.2709495425224304, 'sampling/importance_sampling_ratio/mean': 0.999986469745636, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.552715502679348, 'clip_ratio/low_mean': 0.00019155561699335522, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0714410766231595e-05, 'clip_ratio/high_max': 8.285764306492638e-05, 'clip_ratio/region_mean': 0.00021227002844170784, 'epoch': 0.03}
+
+  7%|▋         | 75/1024 [5:38:46<74:07:13, 281.17s/it][AINFO 12-02 07:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:20:51 [block_pool.py:292] Successfully reset prefix cache
+
+  7%|▋         | 76/1024 [5:43:43<75:16:42, 285.87s/it][A
+                                                       [A{'loss': 0.055, 'grad_norm': 0.0022420468740165234, 'learning_rate': 1e-05, 'num_tokens': 27039744.0, 'completions/mean_length': 7356.203125, 'completions/min_length': 462.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 6247.5263671875, 'completions/min_terminated_length': 462.0, 'completions/max_terminated_length': 15444.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.2961388826370239, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016521483659744263, 'sampling/sampling_logp_difference/max': 4.522876739501953, 'sampling/importance_sampling_ratio/min': 0.010857743211090565, 'sampling/importance_sampling_ratio/mean': 0.9999997615814209, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43883639574050903, 'clip_ratio/low_mean': 0.0001334776297881035, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7387406842317432e-05, 'clip_ratio/high_max': 5.920579860685393e-05, 'clip_ratio/region_mean': 0.0001508650389041577, 'epoch': 0.03}
+
+  7%|▋         | 76/1024 [5:43:43<75:16:42, 285.87s/it][AINFO 12-02 07:25:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:25:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:25:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:25:48 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 77/1024 [5:47:23<70:00:44, 266.15s/it][A
+                                                       [A{'loss': 0.1052, 'grad_norm': 0.0031257288064807653, 'learning_rate': 1e-05, 'num_tokens': 27300956.0, 'completions/mean_length': 3919.5625, 'completions/min_length': 238.0, 'completions/max_length': 13813.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3919.5625, 'completions/min_terminated_length': 238.0, 'completions/max_terminated_length': 13813.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.35612428188323975, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017183197662234306, 'sampling/sampling_logp_difference/max': 1.1030888557434082, 'sampling/importance_sampling_ratio/min': 0.3318444788455963, 'sampling/importance_sampling_ratio/mean': 1.0000545978546143, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5422935001552105, 'clip_ratio/low_mean': 0.00011971614117101126, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.3930947615299374e-05, 'clip_ratio/high_max': 0.00015076155432325322, 'clip_ratio/region_mean': 0.00017364708855893696, 'epoch': 0.04}
+
+  8%|▊         | 77/1024 [5:47:23<70:00:44, 266.15s/it][AINFO 12-02 07:29:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:28 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 78/1024 [5:51:31<68:29:49, 260.67s/it][A
+                                                       [A{'loss': 0.0066, 'grad_norm': 0.0019214291824027896, 'learning_rate': 1e-05, 'num_tokens': 27657559.0, 'completions/mean_length': 5436.046875, 'completions/min_length': 573.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5262.27001953125, 'completions/min_terminated_length': 573.0, 'completions/max_terminated_length': 14167.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.3834536373615265, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017530949786305428, 'sampling/sampling_logp_difference/max': 1.6708216667175293, 'sampling/importance_sampling_ratio/min': 0.188092440366745, 'sampling/importance_sampling_ratio/mean': 1.000032901763916, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46766940876841545, 'clip_ratio/low_mean': 0.00021173169579924433, 'clip_ratio/low_min': 2.255096478620544e-05, 'clip_ratio/high_mean': 5.326506493474881e-05, 'clip_ratio/high_max': 0.00018665866537048714, 'clip_ratio/region_mean': 0.0002649967591423774, 'epoch': 0.04}
+
+  8%|▊         | 78/1024 [5:51:31<68:29:49, 260.67s/it][AINFO 12-02 07:33:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:33:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:33:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:33:36 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 79/1024 [5:55:40<67:33:30, 257.37s/it][A
+                                                       [A{'loss': 0.0795, 'grad_norm': 0.0026777610182762146, 'learning_rate': 1e-05, 'num_tokens': 27962342.0, 'completions/mean_length': 4586.859375, 'completions/min_length': 417.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4399.603515625, 'completions/min_terminated_length': 417.0, 'completions/max_terminated_length': 14477.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.4375, 'reward_std': 0.2756394147872925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017153657972812653, 'sampling/sampling_logp_difference/max': 2.124561309814453, 'sampling/importance_sampling_ratio/min': 0.11948537081480026, 'sampling/importance_sampling_ratio/mean': 0.9998522996902466, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44445816427469254, 'clip_ratio/low_mean': 0.0003093198602073244, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.005037797265686e-05, 'clip_ratio/high_max': 9.148012941295747e-05, 'clip_ratio/region_mean': 0.00033937023545149714, 'epoch': 0.04}
+
+  8%|▊         | 79/1024 [5:55:40<67:33:30, 257.37s/it][AINFO 12-02 07:37:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:37:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:37:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:37:46 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 80/1024 [5:59:19<64:25:45, 245.70s/it][A
+                                                       [A{'loss': 0.0292, 'grad_norm': 0.00130936736240983, 'learning_rate': 1e-05, 'num_tokens': 28273953.0, 'completions/mean_length': 4734.171875, 'completions/min_length': 549.0, 'completions/max_length': 13939.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4734.171875, 'completions/min_terminated_length': 549.0, 'completions/max_terminated_length': 13939.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.31512534618377686, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018276486545801163, 'sampling/sampling_logp_difference/max': 3.8840980529785156, 'sampling/importance_sampling_ratio/min': 0.020566370338201523, 'sampling/importance_sampling_ratio/mean': 1.0000052452087402, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43582708761096, 'clip_ratio/low_mean': 0.00012357529885775875, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.4197754846209136e-05, 'clip_ratio/high_max': 0.00016560633912376943, 'clip_ratio/region_mean': 0.00016777305143023113, 'epoch': 0.04}
+
+  8%|▊         | 80/1024 [5:59:19<64:25:45, 245.70s/it][AINFO 12-02 07:41:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:41:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:41:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:41:24 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 81/1024 [6:03:58<66:57:36, 255.63s/it][A
+                                                       [A{'loss': -0.0074, 'grad_norm': 0.001850566710345447, 'learning_rate': 1e-05, 'num_tokens': 28602952.0, 'completions/mean_length': 4986.859375, 'completions/min_length': 555.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4805.95263671875, 'completions/min_terminated_length': 555.0, 'completions/max_terminated_length': 14214.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.34352827072143555, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016178656369447708, 'sampling/sampling_logp_difference/max': 2.387148380279541, 'sampling/importance_sampling_ratio/min': 0.09189135581254959, 'sampling/importance_sampling_ratio/mean': 0.9999573230743408, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.436476893723011, 'clip_ratio/low_mean': 0.00024071300776995486, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.373167770514556e-05, 'clip_ratio/high_max': 0.00021553437090915395, 'clip_ratio/region_mean': 0.00032444469252368435, 'epoch': 0.04}
+
+  8%|▊         | 81/1024 [6:03:58<66:57:36, 255.63s/it][AINFO 12-02 07:46:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:46:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:46:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:46:03 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 82/1024 [6:08:41<69:06:36, 264.12s/it][A
+                                                       [A{'loss': 0.0183, 'grad_norm': 0.00494338059797883, 'learning_rate': 1e-05, 'num_tokens': 28908204.0, 'completions/mean_length': 4627.9375, 'completions/min_length': 451.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4441.33349609375, 'completions/min_terminated_length': 451.0, 'completions/max_terminated_length': 14755.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.014848449267446995, 'sampling/sampling_logp_difference/max': 1.4697718620300293, 'sampling/importance_sampling_ratio/min': 0.22997793555259705, 'sampling/importance_sampling_ratio/mean': 0.9999882578849792, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4143282398581505, 'clip_ratio/low_mean': 0.00015790256384207169, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.390057327123941e-05, 'clip_ratio/high_max': 0.0001042661187966587, 'clip_ratio/region_mean': 0.00019180313483957434, 'epoch': 0.04}
+
+  8%|▊         | 82/1024 [6:08:41<69:06:36, 264.12s/it][AINFO 12-02 07:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:47 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 83/1024 [6:11:48<62:56:19, 240.79s/it][A
+                                                       [A{'loss': -0.0291, 'grad_norm': 0.0027634254656732082, 'learning_rate': 1e-05, 'num_tokens': 29150542.0, 'completions/mean_length': 3606.03125, 'completions/min_length': 270.0, 'completions/max_length': 12934.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3606.03125, 'completions/min_terminated_length': 270.0, 'completions/max_terminated_length': 12934.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.41398805379867554, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01823471114039421, 'sampling/sampling_logp_difference/max': 0.7953894138336182, 'sampling/importance_sampling_ratio/min': 0.45140543580055237, 'sampling/importance_sampling_ratio/mean': 1.0000288486480713, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4441828317940235, 'clip_ratio/low_mean': 0.0003676890682982048, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.00011490001406855299, 'clip_ratio/high_max': 0.00028493666286522057, 'clip_ratio/region_mean': 0.00048258908464049455, 'epoch': 0.04}
+
+  8%|▊         | 83/1024 [6:11:48<62:56:19, 240.79s/it][AINFO 12-02 07:53:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:54 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 84/1024 [6:15:46<62:41:42, 240.11s/it][A
+                                                       [A{'loss': -0.0122, 'grad_norm': 0.0023619274143129587, 'learning_rate': 1e-05, 'num_tokens': 29471800.0, 'completions/mean_length': 4837.53125, 'completions/min_length': 424.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4654.25439453125, 'completions/min_terminated_length': 424.0, 'completions/max_terminated_length': 13335.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.37981897592544556, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.0174616277217865, 'sampling/sampling_logp_difference/max': 1.7406625747680664, 'sampling/importance_sampling_ratio/min': 0.1754041463136673, 'sampling/importance_sampling_ratio/mean': 1.000072717666626, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4428352490067482, 'clip_ratio/low_mean': 0.0003463904276941321, 'clip_ratio/low_min': 1.804272505978588e-05, 'clip_ratio/high_mean': 6.79084801049612e-05, 'clip_ratio/high_max': 0.0001782336494215997, 'clip_ratio/region_mean': 0.0004142989018873777, 'epoch': 0.04}
+
+  8%|▊         | 84/1024 [6:15:46<62:41:42, 240.11s/it][AINFO 12-02 07:57:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:57:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:57:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:57:52 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 85/1024 [6:19:50<62:56:47, 241.33s/it][A
+                                                       [A{'loss': -0.0359, 'grad_norm': 0.0013597882352769375, 'learning_rate': 1e-05, 'num_tokens': 29726955.0, 'completions/mean_length': 3789.546875, 'completions/min_length': 435.0, 'completions/max_length': 15459.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3789.546875, 'completions/min_terminated_length': 435.0, 'completions/max_terminated_length': 15459.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015852496027946472, 'sampling/sampling_logp_difference/max': 1.3977675437927246, 'sampling/importance_sampling_ratio/min': 0.24714809656143188, 'sampling/importance_sampling_ratio/mean': 1.0000308752059937, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.41566407680511475, 'clip_ratio/low_mean': 0.00015632138047294575, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8899456083308905e-05, 'clip_ratio/high_max': 9.811175550566986e-05, 'clip_ratio/region_mean': 0.00018522083723837568, 'epoch': 0.04}
+
+  8%|▊         | 85/1024 [6:19:51<62:56:47, 241.33s/it][AINFO 12-02 08:01:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:56 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 86/1024 [6:23:08<59:26:14, 228.12s/it][A
+                                                       [A{'loss': 0.0146, 'grad_norm': 0.002951845293864608, 'learning_rate': 1e-05, 'num_tokens': 30011953.0, 'completions/mean_length': 4287.21875, 'completions/min_length': 775.0, 'completions/max_length': 13226.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4287.21875, 'completions/min_terminated_length': 775.0, 'completions/max_terminated_length': 13226.0, 'rewards/accuracy_reward/mean': 0.78125, 'rewards/accuracy_reward/std': 0.4166666865348816, 'reward': 0.78125, 'reward_std': 0.37981897592544556, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014355344697833061, 'sampling/sampling_logp_difference/max': 1.9825925827026367, 'sampling/importance_sampling_ratio/min': 0.13771173357963562, 'sampling/importance_sampling_ratio/mean': 0.9999309778213501, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40521714091300964, 'clip_ratio/low_mean': 0.00011074364056185004, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.814381170741399e-05, 'clip_ratio/high_max': 0.00012764245548169129, 'clip_ratio/region_mean': 0.00014888745135976933, 'epoch': 0.04}
+
+  8%|▊         | 86/1024 [6:23:08<59:26:14, 228.12s/it][AINFO 12-02 08:05:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:05:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:05:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:05:14 [block_pool.py:292] Successfully reset prefix cache
+
+  8%|▊         | 87/1024 [6:27:20<61:14:01, 235.26s/it][A
+                                                       [A{'loss': 0.0496, 'grad_norm': 0.0023449361324310303, 'learning_rate': 1e-05, 'num_tokens': 30287813.0, 'completions/mean_length': 4144.8125, 'completions/min_length': 238.0, 'completions/max_length': 15827.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4144.8125, 'completions/min_terminated_length': 238.0, 'completions/max_terminated_length': 15827.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.3208816647529602, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015097986906766891, 'sampling/sampling_logp_difference/max': 6.042912483215332, 'sampling/importance_sampling_ratio/min': 0.0023746327497065067, 'sampling/importance_sampling_ratio/mean': 0.9999423623085022, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44145946204662323, 'clip_ratio/low_mean': 0.00012137663225075812, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6621919687386253e-05, 'clip_ratio/high_max': 0.00010648767874954501, 'clip_ratio/region_mean': 0.000147998550801276, 'epoch': 0.04}
+
+  8%|▊         | 87/1024 [6:27:20<61:14:01, 235.26s/it][AINFO 12-02 08:09:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:25 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▊         | 88/1024 [6:31:35<62:42:01, 241.16s/it][A
+                                                       [A{'loss': -0.1034, 'grad_norm': 0.001615686109289527, 'learning_rate': 1e-05, 'num_tokens': 30549449.0, 'completions/mean_length': 3949.1875, 'completions/min_length': 434.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 3548.064453125, 'completions/min_terminated_length': 434.0, 'completions/max_terminated_length': 13651.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.32195523381233215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01933361403644085, 'sampling/sampling_logp_difference/max': 5.496237754821777, 'sampling/importance_sampling_ratio/min': 0.004102176055312157, 'sampling/importance_sampling_ratio/mean': 1.0000253915786743, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4849945046007633, 'clip_ratio/low_mean': 5.74980895180488e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.077752601006068e-05, 'clip_ratio/high_max': 0.00016311010404024273, 'clip_ratio/region_mean': 9.827561370912008e-05, 'epoch': 0.04}
+
+  9%|▊         | 88/1024 [6:31:35<62:42:01, 241.16s/it][AINFO 12-02 08:13:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:13:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:13:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:13:40 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▊         | 89/1024 [6:36:26<66:31:16, 256.12s/it][A
+                                                       [A{'loss': 0.0494, 'grad_norm': 0.003227722831070423, 'learning_rate': 1e-05, 'num_tokens': 31004564.0, 'completions/mean_length': 6941.171875, 'completions/min_length': 1097.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6636.564453125, 'completions/min_terminated_length': 1097.0, 'completions/max_terminated_length': 15863.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.3661494255065918, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01690554805099964, 'sampling/sampling_logp_difference/max': 3.3597805500030518, 'sampling/importance_sampling_ratio/min': 0.03474288433790207, 'sampling/importance_sampling_ratio/mean': 0.9999713897705078, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43385689333081245, 'clip_ratio/low_mean': 0.00027034243794332724, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.314225311143673e-05, 'clip_ratio/high_max': 0.00016852732369443402, 'clip_ratio/region_mean': 0.0003334846851430484, 'epoch': 0.04}
+
+  9%|▊         | 89/1024 [6:36:26<66:31:16, 256.12s/it][AINFO 12-02 08:18:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:31 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 90/1024 [6:41:58<72:23:22, 279.02s/it][A
+                                                       [A{'loss': 0.1052, 'grad_norm': 0.003051145700737834, 'learning_rate': 1e-05, 'num_tokens': 31460073.0, 'completions/mean_length': 6929.328125, 'completions/min_length': 825.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 5951.2587890625, 'completions/min_terminated_length': 825.0, 'completions/max_terminated_length': 16069.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.31983357667922974, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018933985382318497, 'sampling/sampling_logp_difference/max': 18.319889068603516, 'sampling/importance_sampling_ratio/min': 1.1060461879708328e-08, 'sampling/importance_sampling_ratio/mean': 1.000002384185791, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5281123965978622, 'clip_ratio/low_mean': 0.00019669710945890984, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.1479594528937014e-05, 'clip_ratio/high_max': 0.00013798837153444765, 'clip_ratio/region_mean': 0.000238176708080573, 'epoch': 0.04}
+
+  9%|▉         | 90/1024 [6:41:58<72:23:22, 279.02s/it][AINFO 12-02 08:24:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:04 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 91/1024 [6:46:36<72:13:14, 278.67s/it][A
+                                                       [A{'loss': 0.099, 'grad_norm': 0.0020714199636131525, 'learning_rate': 1e-05, 'num_tokens': 31881702.0, 'completions/mean_length': 6454.578125, 'completions/min_length': 1242.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5792.6171875, 'completions/min_terminated_length': 1242.0, 'completions/max_terminated_length': 15659.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.2756394147872925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018824318423867226, 'sampling/sampling_logp_difference/max': 16.48392105102539, 'sampling/importance_sampling_ratio/min': 6.936238605703693e-08, 'sampling/importance_sampling_ratio/mean': 0.999925971031189, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5252735465764999, 'clip_ratio/low_mean': 0.00020054339893249562, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.100329399785551e-05, 'clip_ratio/high_max': 0.00011954138335568132, 'clip_ratio/region_mean': 0.00023154669725045096, 'epoch': 0.04}
+
+  9%|▉         | 91/1024 [6:46:36<72:13:14, 278.67s/it][AINFO 12-02 08:28:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:28:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:28:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:28:42 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 92/1024 [6:49:34<64:18:09, 248.38s/it][A
+                                                       [A{'loss': -0.0118, 'grad_norm': 0.002296029357239604, 'learning_rate': 1e-05, 'num_tokens': 32190589.0, 'completions/mean_length': 4675.984375, 'completions/min_length': 210.0, 'completions/max_length': 11263.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4675.984375, 'completions/min_terminated_length': 210.0, 'completions/max_terminated_length': 11263.0, 'rewards/accuracy_reward/mean': 0.78125, 'rewards/accuracy_reward/std': 0.4166666865348816, 'reward': 0.78125, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0159091018140316, 'sampling/sampling_logp_difference/max': 2.2179837226867676, 'sampling/importance_sampling_ratio/min': 0.10882831364870071, 'sampling/importance_sampling_ratio/mean': 1.000171422958374, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3901951462030411, 'clip_ratio/low_mean': 0.0001777295929059619, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.472397146557341e-05, 'clip_ratio/high_max': 0.0002671323782124091, 'clip_ratio/region_mean': 0.00025245356482628267, 'epoch': 0.04}
+
+  9%|▉         | 92/1024 [6:49:34<64:18:09, 248.38s/it][AINFO 12-02 08:31:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:31:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:31:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:31:39 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 93/1024 [6:52:32<58:47:39, 227.35s/it][A
+                                                       [A{'loss': 0.0493, 'grad_norm': 0.003976813983172178, 'learning_rate': 1e-05, 'num_tokens': 32463328.0, 'completions/mean_length': 3949.796875, 'completions/min_length': 718.0, 'completions/max_length': 11713.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3949.796875, 'completions/min_terminated_length': 718.0, 'completions/max_terminated_length': 11713.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.3608325123786926, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016604486852884293, 'sampling/sampling_logp_difference/max': 1.353191614151001, 'sampling/importance_sampling_ratio/min': 0.2584141790866852, 'sampling/importance_sampling_ratio/mean': 1.0000035762786865, 'sampling/importance_sampling_ratio/max': 1.895107388496399, 'entropy': 0.5187858492136002, 'clip_ratio/low_mean': 0.00022115569072411745, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.412918954381894e-05, 'clip_ratio/high_max': 9.62632120717899e-05, 'clip_ratio/region_mean': 0.00025528488322379417, 'epoch': 0.04}
+
+  9%|▉         | 93/1024 [6:52:32<58:47:39, 227.35s/it][AINFO 12-02 08:34:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:34:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:34:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:34:38 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 94/1024 [6:57:23<63:41:15, 246.53s/it][A
+                                                       [A{'loss': 0.2159, 'grad_norm': 0.0019280276028439403, 'learning_rate': 1e-05, 'num_tokens': 32875752.0, 'completions/mean_length': 6298.375, 'completions/min_length': 723.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5443.6611328125, 'completions/min_terminated_length': 723.0, 'completions/max_terminated_length': 16197.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.3845370411872864, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01958397403359413, 'sampling/sampling_logp_difference/max': 6.089755058288574, 'sampling/importance_sampling_ratio/min': 0.00226596393622458, 'sampling/importance_sampling_ratio/mean': 1.0000284910202026, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4672521837055683, 'clip_ratio/low_mean': 0.00025854546129266964, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.00010159406429011142, 'clip_ratio/high_max': 0.00024776975260465406, 'clip_ratio/region_mean': 0.0003601395328587387, 'epoch': 0.04}
+
+  9%|▉         | 94/1024 [6:57:23<63:41:15, 246.53s/it][AINFO 12-02 08:39:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:39:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:39:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:39:29 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 95/1024 [7:02:19<67:27:10, 261.39s/it][A
+                                                       [A{'loss': -0.0241, 'grad_norm': 0.001792717957869172, 'learning_rate': 1e-05, 'num_tokens': 33291246.0, 'completions/mean_length': 6352.84375, 'completions/min_length': 458.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 4919.82177734375, 'completions/min_terminated_length': 458.0, 'completions/max_terminated_length': 14409.0, 'rewards/accuracy_reward/mean': 0.671875, 'rewards/accuracy_reward/std': 0.4732423722743988, 'reward': 0.671875, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016141103580594063, 'sampling/sampling_logp_difference/max': 3.8868091106414795, 'sampling/importance_sampling_ratio/min': 0.020510688424110413, 'sampling/importance_sampling_ratio/mean': 1.0001204013824463, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3965361528098583, 'clip_ratio/low_mean': 0.0002863152385543799, 'clip_ratio/low_min': 1.629301368666347e-05, 'clip_ratio/high_mean': 4.90948746119102e-05, 'clip_ratio/high_max': 0.00013045434934610967, 'clip_ratio/region_mean': 0.0003354100999786169, 'epoch': 0.04}
+
+  9%|▉         | 95/1024 [7:02:19<67:27:10, 261.39s/it][AINFO 12-02 08:44:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:44:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:44:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:44:25 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 96/1024 [7:06:25<66:08:05, 256.56s/it][A
+                                                       [A{'loss': -0.0598, 'grad_norm': 0.00189808732829988, 'learning_rate': 1e-05, 'num_tokens': 33544663.0, 'completions/mean_length': 3824.390625, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 3419.241943359375, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 14085.0, 'rewards/accuracy_reward/mean': 0.796875, 'rewards/accuracy_reward/std': 0.40550529956817627, 'reward': 0.796875, 'reward_std': 0.3845370411872864, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014164968393743038, 'sampling/sampling_logp_difference/max': 3.953634738922119, 'sampling/importance_sampling_ratio/min': 0.019184842705726624, 'sampling/importance_sampling_ratio/mean': 1.0000834465026855, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4274088926613331, 'clip_ratio/low_mean': 9.56206449700403e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.945373115740949e-05, 'clip_ratio/high_max': 0.00017129768730228534, 'clip_ratio/region_mean': 0.0001650743779464392, 'epoch': 0.04}
+
+  9%|▉         | 96/1024 [7:06:25<66:08:05, 256.56s/it][AINFO 12-02 08:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:48:30 [block_pool.py:292] Successfully reset prefix cache
+
+  9%|▉         | 97/1024 [7:10:59<67:24:21, 261.77s/it][A
+                                                       [A{'loss': 0.1669, 'grad_norm': 0.0021286753471940756, 'learning_rate': 1e-05, 'num_tokens': 33923784.0, 'completions/mean_length': 5751.265625, 'completions/min_length': 1114.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4850.1865234375, 'completions/min_terminated_length': 1114.0, 'completions/max_terminated_length': 14226.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.3025038540363312, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01443873718380928, 'sampling/sampling_logp_difference/max': 1.2766265869140625, 'sampling/importance_sampling_ratio/min': 0.27897679805755615, 'sampling/importance_sampling_ratio/mean': 0.999960720539093, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3344818912446499, 'clip_ratio/low_mean': 0.0001494620642006339, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.985071248280292e-05, 'clip_ratio/high_max': 0.00012197062733321218, 'clip_ratio/region_mean': 0.00018931277554656845, 'epoch': 0.04}
+
+  9%|▉         | 97/1024 [7:10:59<67:24:21, 261.77s/it][AINFO 12-02 08:53:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 98/1024 [7:13:22<58:13:12, 226.34s/it][A
+                                                       [A{'loss': 0.0362, 'grad_norm': 0.004585409536957741, 'learning_rate': 1e-05, 'num_tokens': 34126231.0, 'completions/mean_length': 3014.609375, 'completions/min_length': 874.0, 'completions/max_length': 10016.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3014.609375, 'completions/min_terminated_length': 874.0, 'completions/max_terminated_length': 10016.0, 'rewards/accuracy_reward/mean': 0.84375, 'rewards/accuracy_reward/std': 0.36596253514289856, 'reward': 0.84375, 'reward_std': 0.2709311842918396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.013324004597961903, 'sampling/sampling_logp_difference/max': 0.7330055236816406, 'sampling/importance_sampling_ratio/min': 0.48046278953552246, 'sampling/importance_sampling_ratio/mean': 0.9999639987945557, 'sampling/importance_sampling_ratio/max': 1.9451061487197876, 'entropy': 0.39649129286408424, 'clip_ratio/low_mean': 4.214366981614148e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.521566292074567e-05, 'clip_ratio/high_max': 0.00023780894935043762, 'clip_ratio/region_mean': 0.00012735933341900818, 'epoch': 0.05}
+
+ 10%|▉         | 98/1024 [7:13:22<58:13:12, 226.34s/it][AINFO 12-02 08:55:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:55:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:55:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:55:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 99/1024 [7:17:49<61:16:29, 238.47s/it][A
+                                                       [A{'loss': -0.0764, 'grad_norm': 0.0021621997002512217, 'learning_rate': 1e-05, 'num_tokens': 34516071.0, 'completions/mean_length': 5942.25, 'completions/min_length': 716.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5776.50830078125, 'completions/min_terminated_length': 716.0, 'completions/max_terminated_length': 15178.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.34034284949302673, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01659306511282921, 'sampling/sampling_logp_difference/max': 3.093405246734619, 'sampling/importance_sampling_ratio/min': 0.04534727334976196, 'sampling/importance_sampling_ratio/mean': 0.9998834133148193, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.38903985917568207, 'clip_ratio/low_mean': 0.0001592071539562312, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.785543698446418e-05, 'clip_ratio/high_max': 0.000270107318101509, 'clip_ratio/region_mean': 0.0002370625934418058, 'epoch': 0.05}
+
+ 10%|▉         | 99/1024 [7:17:49<61:16:29, 238.47s/it][AINFO 12-02 08:59:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 100/1024 [7:22:20<63:42:17, 248.20s/it][A
+                                                        [A{'loss': 0.0697, 'grad_norm': 0.0009822722058743238, 'learning_rate': 1e-05, 'num_tokens': 34833055.0, 'completions/mean_length': 4830.25, 'completions/min_length': 429.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4646.857421875, 'completions/min_terminated_length': 429.0, 'completions/max_terminated_length': 14630.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.25513991713523865, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016384180635213852, 'sampling/sampling_logp_difference/max': 2.9902172088623047, 'sampling/importance_sampling_ratio/min': 0.05027651786804199, 'sampling/importance_sampling_ratio/mean': 0.9998177886009216, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4372781030833721, 'clip_ratio/low_mean': 0.0002205613964179065, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.206907917454373e-05, 'clip_ratio/high_max': 0.00013749653044214938, 'clip_ratio/region_mean': 0.00027263047468295554, 'epoch': 0.05}
+
+ 10%|▉         | 100/1024 [7:22:20<63:42:17, 248.20s/it][AINFO 12-02 09:04:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:04:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:04:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:04:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 101/1024 [7:27:19<67:32:21, 263.42s/it][A
+                                                        [A{'loss': 0.0665, 'grad_norm': 0.002815033309161663, 'learning_rate': 1e-05, 'num_tokens': 35267159.0, 'completions/mean_length': 6606.375, 'completions/min_length': 743.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6290.9677734375, 'completions/min_terminated_length': 743.0, 'completions/max_terminated_length': 15387.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015393032692372799, 'sampling/sampling_logp_difference/max': 2.2247695922851562, 'sampling/importance_sampling_ratio/min': 0.10809232294559479, 'sampling/importance_sampling_ratio/mean': 0.9999589920043945, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.39939532428979874, 'clip_ratio/low_mean': 0.00024672951349202776, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6721513449956547e-05, 'clip_ratio/high_max': 8.850259200698929e-05, 'clip_ratio/region_mean': 0.0002734510235313792, 'epoch': 0.05}
+
+ 10%|▉         | 101/1024 [7:27:19<67:32:21, 263.42s/it][AINFO 12-02 09:09:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:09:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|▉         | 102/1024 [7:32:11<69:38:16, 271.90s/it][A
+                                                        [A{'loss': 0.0158, 'grad_norm': 0.002312592463567853, 'learning_rate': 1e-05, 'num_tokens': 35646158.0, 'completions/mean_length': 5721.484375, 'completions/min_length': 664.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4817.88134765625, 'completions/min_terminated_length': 664.0, 'completions/max_terminated_length': 16190.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.26196980476379395, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01828182116150856, 'sampling/sampling_logp_difference/max': 2.335542678833008, 'sampling/importance_sampling_ratio/min': 0.09675796329975128, 'sampling/importance_sampling_ratio/mean': 0.9999186992645264, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44412609189748764, 'clip_ratio/low_mean': 0.0001427162353593303, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.599432918439561e-05, 'clip_ratio/high_max': 0.0001562020088385907, 'clip_ratio/region_mean': 0.00018871055885938404, 'epoch': 0.05}
+
+ 10%|▉         | 102/1024 [7:32:11<69:38:16, 271.90s/it][AINFO 12-02 09:14:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 103/1024 [7:36:56<70:38:00, 276.09s/it][A
+                                                        [A{'loss': 0.0132, 'grad_norm': 0.001596050919033587, 'learning_rate': 1e-05, 'num_tokens': 35989479.0, 'completions/mean_length': 5174.515625, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4996.58740234375, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 15546.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.342454731464386, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017433863133192062, 'sampling/sampling_logp_difference/max': 1.4259519577026367, 'sampling/importance_sampling_ratio/min': 0.24027962982654572, 'sampling/importance_sampling_ratio/mean': 0.9999865889549255, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48807933926582336, 'clip_ratio/low_mean': 0.00024369892526010517, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.587288933384116e-05, 'clip_ratio/high_max': 0.0001761527619237313, 'clip_ratio/region_mean': 0.0002995718150486937, 'epoch': 0.05}
+
+ 10%|█         | 103/1024 [7:36:56<70:38:00, 276.09s/it][AINFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:19:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 104/1024 [7:39:24<60:44:37, 237.69s/it][A
+                                                        [A{'loss': 0.0246, 'grad_norm': 0.002243574010208249, 'learning_rate': 1e-05, 'num_tokens': 36252404.0, 'completions/mean_length': 3940.703125, 'completions/min_length': 589.0, 'completions/max_length': 9683.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3940.703125, 'completions/min_terminated_length': 589.0, 'completions/max_terminated_length': 9683.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.3403330445289612, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017668718472123146, 'sampling/sampling_logp_difference/max': 5.725813865661621, 'sampling/importance_sampling_ratio/min': 0.003260698402300477, 'sampling/importance_sampling_ratio/mean': 0.9999291896820068, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.542726919054985, 'clip_ratio/low_mean': 0.00021115628123880015, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.382280296042154e-05, 'clip_ratio/high_max': 0.00020642880645027617, 'clip_ratio/region_mean': 0.00028497908260760596, 'epoch': 0.05}
+
+ 10%|█         | 104/1024 [7:39:24<60:44:37, 237.69s/it][AINFO 12-02 09:21:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:21:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:21:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:21:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 105/1024 [7:44:14<64:40:42, 253.37s/it][A
+                                                        [A{'loss': 0.1312, 'grad_norm': 0.003278986318036914, 'learning_rate': 1e-05, 'num_tokens': 36600876.0, 'completions/mean_length': 5294.625, 'completions/min_length': 196.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 4555.33349609375, 'completions/min_terminated_length': 196.0, 'completions/max_terminated_length': 15821.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.35612428188323975, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017758361995220184, 'sampling/sampling_logp_difference/max': 1.9388784170150757, 'sampling/importance_sampling_ratio/min': 0.14386522769927979, 'sampling/importance_sampling_ratio/mean': 0.9998038411140442, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.49857256934046745, 'clip_ratio/low_mean': 0.0002851912704500137, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.760785705253511e-05, 'clip_ratio/high_max': 0.000100779246622551, 'clip_ratio/region_mean': 0.00032279912920785137, 'epoch': 0.05}
+
+ 10%|█         | 105/1024 [7:44:14<64:40:42, 253.37s/it][AINFO 12-02 09:26:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:26:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:26:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:26:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 106/1024 [7:49:03<67:19:23, 264.01s/it][A
+                                                        [A{'loss': 0.0033, 'grad_norm': 0.0010947652626782656, 'learning_rate': 1e-05, 'num_tokens': 37082643.0, 'completions/mean_length': 7378.609375, 'completions/min_length': 347.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6778.25048828125, 'completions/min_terminated_length': 347.0, 'completions/max_terminated_length': 15982.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.390625, 'reward_std': 0.31512534618377686, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02263464778661728, 'sampling/sampling_logp_difference/max': 2.542645215988159, 'sampling/importance_sampling_ratio/min': 0.07865805923938751, 'sampling/importance_sampling_ratio/mean': 1.0001232624053955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5822085291147232, 'clip_ratio/low_mean': 0.0004054581149830483, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.477953760011587e-05, 'clip_ratio/high_max': 0.00012376565791782923, 'clip_ratio/region_mean': 0.00044023765258316416, 'epoch': 0.05}
+
+ 10%|█         | 106/1024 [7:49:03<67:19:23, 264.01s/it][AINFO 12-02 09:31:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:31:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:31:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:31:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 10%|█         | 107/1024 [7:53:56<69:24:25, 272.48s/it][A
+                                                        [A{'loss': -0.0591, 'grad_norm': 0.0006394436350092292, 'learning_rate': 1e-05, 'num_tokens': 37531330.0, 'completions/mean_length': 6857.859375, 'completions/min_length': 771.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6050.5595703125, 'completions/min_terminated_length': 771.0, 'completions/max_terminated_length': 14986.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.375, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019778329879045486, 'sampling/sampling_logp_difference/max': 2.0357680320739746, 'sampling/importance_sampling_ratio/min': 0.13058015704154968, 'sampling/importance_sampling_ratio/mean': 1.000154733657837, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48234212026000023, 'clip_ratio/low_mean': 0.00012846527351939585, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.09765074589086e-05, 'clip_ratio/high_max': 0.00018101240948453778, 'clip_ratio/region_mean': 0.00019944178256992018, 'epoch': 0.05}
+
+ 10%|█         | 107/1024 [7:53:56<69:24:25, 272.48s/it][AINFO 12-02 09:36:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:36:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:36:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:36:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 108/1024 [7:58:38<70:04:32, 275.41s/it][A
+                                                        [A{'loss': -0.0237, 'grad_norm': 0.004839874338358641, 'learning_rate': 1e-05, 'num_tokens': 37804146.0, 'completions/mean_length': 4130.375, 'completions/min_length': 559.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 3527.737548828125, 'completions/min_terminated_length': 559.0, 'completions/max_terminated_length': 13267.0, 'rewards/accuracy_reward/mean': 0.75, 'rewards/accuracy_reward/std': 0.4364357888698578, 'reward': 0.75, 'reward_std': 0.35824596881866455, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014850424602627754, 'sampling/sampling_logp_difference/max': 2.3441402912139893, 'sampling/importance_sampling_ratio/min': 0.1985471248626709, 'sampling/importance_sampling_ratio/mean': 0.9998599290847778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.38795367255806923, 'clip_ratio/low_mean': 0.0001764288294907601, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.048349615004554e-05, 'clip_ratio/high_max': 0.00013104493973514764, 'clip_ratio/region_mean': 0.00021691232859666343, 'epoch': 0.05}
+
+ 11%|█         | 108/1024 [7:58:38<70:04:32, 275.41s/it][AINFO 12-02 09:40:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 109/1024 [8:03:33<71:28:56, 281.24s/it][A
+                                                        [A{'loss': -0.0148, 'grad_norm': 0.000928734487388283, 'learning_rate': 1e-05, 'num_tokens': 38183056.0, 'completions/mean_length': 5768.71875, 'completions/min_length': 791.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5246.6552734375, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 16301.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.2198973000049591, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.016724728047847748, 'sampling/sampling_logp_difference/max': 2.2236571311950684, 'sampling/importance_sampling_ratio/min': 0.10821263492107391, 'sampling/importance_sampling_ratio/mean': 0.9998956918716431, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4357805512845516, 'clip_ratio/low_mean': 0.00013507822859537555, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.282327022840036e-05, 'clip_ratio/high_max': 9.129308091360144e-05, 'clip_ratio/region_mean': 0.0001579014979142812, 'epoch': 0.05}
+
+ 11%|█         | 109/1024 [8:03:33<71:28:56, 281.24s/it][AINFO 12-02 09:45:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:45:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:45:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:45:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 110/1024 [8:07:54<69:53:03, 275.26s/it][A
+                                                        [A{'loss': 0.1324, 'grad_norm': 0.0031951405107975006, 'learning_rate': 1e-05, 'num_tokens': 38418327.0, 'completions/mean_length': 3523.984375, 'completions/min_length': 442.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 3109.14501953125, 'completions/min_terminated_length': 442.0, 'completions/max_terminated_length': 15593.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.4429643750190735, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015087015926837921, 'sampling/sampling_logp_difference/max': 1.1892004013061523, 'sampling/importance_sampling_ratio/min': 0.3044646084308624, 'sampling/importance_sampling_ratio/mean': 1.0001622438430786, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44223199039697647, 'clip_ratio/low_mean': 0.00023108526693249587, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.494255310622975e-05, 'clip_ratio/high_max': 0.0001581449523655465, 'clip_ratio/region_mean': 0.000286027821857715, 'epoch': 0.05}
+
+ 11%|█         | 110/1024 [8:07:54<69:53:03, 275.26s/it][AINFO 12-02 09:50:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:50:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:50:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:50:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 111/1024 [8:11:52<66:58:29, 264.08s/it][A
+                                                        [A{'loss': 0.1445, 'grad_norm': 0.0006922043394297361, 'learning_rate': 1e-05, 'num_tokens': 38632292.0, 'completions/mean_length': 3221.078125, 'completions/min_length': 416.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3012.14306640625, 'completions/min_terminated_length': 416.0, 'completions/max_terminated_length': 10932.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.31512534618377686, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.013242723420262337, 'sampling/sampling_logp_difference/max': 1.1476330757141113, 'sampling/importance_sampling_ratio/min': 0.31738710403442383, 'sampling/importance_sampling_ratio/mean': 1.0000813007354736, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3312498927116394, 'clip_ratio/low_mean': 7.798392653057817e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3727985535515472e-05, 'clip_ratio/high_max': 4.4822829295299016e-05, 'clip_ratio/region_mean': 0.00010171191206609365, 'epoch': 0.05}
+
+ 11%|█         | 111/1024 [8:11:52<66:58:29, 264.08s/it][AINFO 12-02 09:53:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:53:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:53:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:53:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 112/1024 [8:17:18<71:35:07, 282.57s/it][A
+                                                        [A{'loss': 0.2115, 'grad_norm': 0.0019497789908200502, 'learning_rate': 1e-05, 'num_tokens': 39201527.0, 'completions/mean_length': 8647.046875, 'completions/min_length': 1452.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.15625, 'completions/mean_terminated_length': 7224.31494140625, 'completions/min_terminated_length': 1452.0, 'completions/max_terminated_length': 16182.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.4739636480808258, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.018058083951473236, 'sampling/sampling_logp_difference/max': 3.9558448791503906, 'sampling/importance_sampling_ratio/min': 0.01914248801767826, 'sampling/importance_sampling_ratio/mean': 0.9999226331710815, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4458686374127865, 'clip_ratio/low_mean': 0.0004752590848511318, 'clip_ratio/low_min': 0.00019177497779310215, 'clip_ratio/high_mean': 1.742804090554273e-05, 'clip_ratio/high_max': 6.142193933555973e-05, 'clip_ratio/region_mean': 0.0004926871188217774, 'epoch': 0.05}
+
+ 11%|█         | 112/1024 [8:17:18<71:35:07, 282.57s/it][AINFO 12-02 09:59:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:59:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:59:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:59:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 113/1024 [8:22:03<71:42:55, 283.40s/it][A
+                                                        [A{'loss': 0.066, 'grad_norm': 0.0021070586517453194, 'learning_rate': 1e-05, 'num_tokens': 39634000.0, 'completions/mean_length': 6591.515625, 'completions/min_length': 396.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5938.68359375, 'completions/min_terminated_length': 396.0, 'completions/max_terminated_length': 12968.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5039526224136353, 'reward': 0.5, 'reward_std': 0.3682710528373718, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018126633018255234, 'sampling/sampling_logp_difference/max': 1.9993243217468262, 'sampling/importance_sampling_ratio/min': 0.13542675971984863, 'sampling/importance_sampling_ratio/mean': 1.000077247619629, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4837340675294399, 'clip_ratio/low_mean': 0.0002601412379590329, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7830886665469734e-05, 'clip_ratio/high_max': 8.686609453434357e-05, 'clip_ratio/region_mean': 0.00028797212507924996, 'epoch': 0.05}
+
+ 11%|█         | 113/1024 [8:22:03<71:42:55, 283.40s/it][AINFO 12-02 10:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 114/1024 [8:26:44<71:25:41, 282.57s/it][A
+                                                        [A{'loss': 0.0179, 'grad_norm': 0.0020677978172898293, 'learning_rate': 1e-05, 'num_tokens': 40030917.0, 'completions/mean_length': 6067.453125, 'completions/min_length': 363.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5560.08154296875, 'completions/min_terminated_length': 363.0, 'completions/max_terminated_length': 13965.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.23144522309303284, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.016291355714201927, 'sampling/sampling_logp_difference/max': 2.1550464630126953, 'sampling/importance_sampling_ratio/min': 0.11589780449867249, 'sampling/importance_sampling_ratio/mean': 1.000042200088501, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46337443962693214, 'clip_ratio/low_mean': 0.0002676048191005975, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5739461761986604e-05, 'clip_ratio/high_max': 0.00010295784704794642, 'clip_ratio/region_mean': 0.0002933442801804631, 'epoch': 0.05}
+
+ 11%|█         | 114/1024 [8:26:44<71:25:41, 282.57s/it][AINFO 12-02 10:08:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:08:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:08:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:08:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█         | 115/1024 [8:30:47<68:23:33, 270.86s/it][A
+                                                        [A{'loss': -0.0068, 'grad_norm': 0.001437116996385157, 'learning_rate': 1e-05, 'num_tokens': 40299079.0, 'completions/mean_length': 4049.28125, 'completions/min_length': 529.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3853.4921875, 'completions/min_terminated_length': 529.0, 'completions/max_terminated_length': 12560.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.43768274784088135, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.01563834585249424, 'sampling/sampling_logp_difference/max': 1.8743491172790527, 'sampling/importance_sampling_ratio/min': 0.15345482528209686, 'sampling/importance_sampling_ratio/mean': 0.9999523758888245, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.41926733776926994, 'clip_ratio/low_mean': 0.0002765881308732787, 'clip_ratio/low_min': 1.9531249563442543e-05, 'clip_ratio/high_mean': 4.667774396693858e-05, 'clip_ratio/high_max': 0.00018671097586775431, 'clip_ratio/region_mean': 0.0003232658746128436, 'epoch': 0.05}
+
+ 11%|█         | 115/1024 [8:30:47<68:23:33, 270.86s/it][AINFO 12-02 10:12:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:12:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:12:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:12:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█▏        | 116/1024 [8:33:19<59:20:16, 235.26s/it][A
+                                                        [A{'loss': 0.0093, 'grad_norm': 0.004336320795118809, 'learning_rate': 1e-05, 'num_tokens': 40500591.0, 'completions/mean_length': 3004.875, 'completions/min_length': 393.0, 'completions/max_length': 10462.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3004.875, 'completions/min_terminated_length': 393.0, 'completions/max_terminated_length': 10462.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.31300368905067444, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016662519425153732, 'sampling/sampling_logp_difference/max': 1.2405017614364624, 'sampling/importance_sampling_ratio/min': 0.2892390489578247, 'sampling/importance_sampling_ratio/mean': 1.0000367164611816, 'sampling/importance_sampling_ratio/max': 1.8102816343307495, 'entropy': 0.5190348252654076, 'clip_ratio/low_mean': 9.401034458278446e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.693717907386599e-05, 'clip_ratio/high_max': 0.00024880793534975965, 'clip_ratio/region_mean': 0.00017094753002311336, 'epoch': 0.05}
+
+ 11%|█▏        | 116/1024 [8:33:19<59:20:16, 235.26s/it][AINFO 12-02 10:15:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:15:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:15:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:15:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 11%|█▏        | 117/1024 [8:38:03<62:57:25, 249.88s/it][A
+                                                        [A{'loss': 0.0159, 'grad_norm': 0.0011345910606905818, 'learning_rate': 1e-05, 'num_tokens': 40909911.0, 'completions/mean_length': 6247.625, 'completions/min_length': 1169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5749.1142578125, 'completions/min_terminated_length': 1169.0, 'completions/max_terminated_length': 16375.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.22461533546447754, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020127378404140472, 'sampling/sampling_logp_difference/max': 4.889856338500977, 'sampling/importance_sampling_ratio/min': 0.00752250337973237, 'sampling/importance_sampling_ratio/mean': 1.0000286102294922, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5587562657892704, 'clip_ratio/low_mean': 0.0001268851310669561, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.083619048993569e-05, 'clip_ratio/high_max': 0.00010754497816378716, 'clip_ratio/region_mean': 0.00015772131973790238, 'epoch': 0.05}
+
+ 11%|█▏        | 117/1024 [8:38:03<62:57:25, 249.88s/it][AINFO 12-02 10:20:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:20:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:20:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:20:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 118/1024 [8:41:51<61:10:59, 243.11s/it][A
+                                                        [A{'loss': 0.1119, 'grad_norm': 0.002775671426206827, 'learning_rate': 1e-05, 'num_tokens': 41145292.0, 'completions/mean_length': 3550.078125, 'completions/min_length': 298.0, 'completions/max_length': 15139.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3550.078125, 'completions/min_terminated_length': 298.0, 'completions/max_terminated_length': 15139.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.4760853052139282, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.014819844625890255, 'sampling/sampling_logp_difference/max': 1.4045265913009644, 'sampling/importance_sampling_ratio/min': 0.24548324942588806, 'sampling/importance_sampling_ratio/mean': 0.999993622303009, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.414435300976038, 'clip_ratio/low_mean': 0.00021231729988357984, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.442964649977512e-05, 'clip_ratio/high_max': 0.0001580848420417169, 'clip_ratio/region_mean': 0.0002767469468381023, 'epoch': 0.05}
+
+ 12%|█▏        | 118/1024 [8:41:51<61:10:59, 243.11s/it][AINFO 12-02 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 119/1024 [8:46:17<62:53:16, 250.16s/it][A
+                                                        [A{'loss': 0.0153, 'grad_norm': 0.004539367742836475, 'learning_rate': 1e-05, 'num_tokens': 41524002.0, 'completions/mean_length': 5776.59375, 'completions/min_length': 1076.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5608.22265625, 'completions/min_terminated_length': 1076.0, 'completions/max_terminated_length': 16243.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5039526224136353, 'reward': 0.5, 'reward_std': 0.35400262475013733, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018392477184534073, 'sampling/sampling_logp_difference/max': 2.573674201965332, 'sampling/importance_sampling_ratio/min': 0.07625485211610794, 'sampling/importance_sampling_ratio/mean': 1.0002080202102661, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5631212890148163, 'clip_ratio/low_mean': 0.0002201834695370053, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.9602702322081313e-05, 'clip_ratio/high_max': 0.0001554040018163505, 'clip_ratio/region_mean': 0.0002797861670842394, 'epoch': 0.05}
+
+ 12%|█▏        | 119/1024 [8:46:17<62:53:16, 250.16s/it][AINFO 12-02 10:28:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:28:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:28:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:28:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 120/1024 [8:50:34<63:19:04, 252.15s/it][A
+                                                        [A{'loss': 0.1003, 'grad_norm': 0.003369415644556284, 'learning_rate': 1e-05, 'num_tokens': 41795663.0, 'completions/mean_length': 4103.203125, 'completions/min_length': 418.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3908.27001953125, 'completions/min_terminated_length': 418.0, 'completions/max_terminated_length': 15220.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.45817241072654724, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.013474097475409508, 'sampling/sampling_logp_difference/max': 2.232908248901367, 'sampling/importance_sampling_ratio/min': 0.10721616446971893, 'sampling/importance_sampling_ratio/mean': 0.9999425411224365, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.35451220348477364, 'clip_ratio/low_mean': 0.0001439079987903824, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.00010172213319492585, 'clip_ratio/high_max': 0.00027204485922993626, 'clip_ratio/region_mean': 0.000245630133576924, 'epoch': 0.06}
+
+ 12%|█▏        | 120/1024 [8:50:34<63:19:04, 252.15s/it][AINFO 12-02 10:32:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:32:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:32:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:32:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 121/1024 [8:53:59<59:42:58, 238.07s/it][A
+                                                        [A{'loss': -0.0472, 'grad_norm': 0.0026842444203794003, 'learning_rate': 1e-05, 'num_tokens': 42019173.0, 'completions/mean_length': 3360.84375, 'completions/min_length': 856.0, 'completions/max_length': 13773.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3360.84375, 'completions/min_terminated_length': 856.0, 'completions/max_terminated_length': 13773.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.3766237497329712, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014354024082422256, 'sampling/sampling_logp_difference/max': 1.2142245769500732, 'sampling/importance_sampling_ratio/min': 0.2969401776790619, 'sampling/importance_sampling_ratio/mean': 0.9999501705169678, 'sampling/importance_sampling_ratio/max': 1.910190463066101, 'entropy': 0.3747357986867428, 'clip_ratio/low_mean': 0.0001754828499542782, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9992395411682082e-05, 'clip_ratio/high_max': 0.00011996958164672833, 'clip_ratio/region_mean': 0.00020547524491121294, 'epoch': 0.06}
+
+ 12%|█▏        | 121/1024 [8:53:59<59:42:58, 238.07s/it][AINFO 12-02 10:36:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:36:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 122/1024 [8:59:25<66:16:21, 264.50s/it][A
+                                                        [A{'loss': 0.0171, 'grad_norm': 0.0007615160429850221, 'learning_rate': 1e-05, 'num_tokens': 42483081.0, 'completions/mean_length': 7079.3125, 'completions/min_length': 618.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 6116.7587890625, 'completions/min_terminated_length': 618.0, 'completions/max_terminated_length': 15477.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.292504221200943, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01558875385671854, 'sampling/sampling_logp_difference/max': 2.108236789703369, 'sampling/importance_sampling_ratio/min': 0.12145192176103592, 'sampling/importance_sampling_ratio/mean': 0.999954104423523, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4430743120610714, 'clip_ratio/low_mean': 6.642685821134364e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.125676443640259e-05, 'clip_ratio/high_max': 0.00016021558167267358, 'clip_ratio/region_mean': 0.00011768362469410931, 'epoch': 0.06}
+
+ 12%|█▏        | 122/1024 [8:59:25<66:16:21, 264.50s/it][AINFO 12-02 10:41:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:41:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:41:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:41:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 123/1024 [9:04:12<67:51:22, 271.12s/it][A
+                                                        [A{'loss': 0.019, 'grad_norm': 0.0020743575878441334, 'learning_rate': 1e-05, 'num_tokens': 43015343.0, 'completions/mean_length': 8170.84375, 'completions/min_length': 1518.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 7162.21044921875, 'completions/min_terminated_length': 1518.0, 'completions/max_terminated_length': 15009.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.46875, 'reward_std': 0.25513991713523865, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018536187708377838, 'sampling/sampling_logp_difference/max': 4.507608413696289, 'sampling/importance_sampling_ratio/min': 0.011024795472621918, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44009917601943016, 'clip_ratio/low_mean': 0.00011833181906695245, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.094606271744851e-05, 'clip_ratio/high_max': 0.0001886956474663748, 'clip_ratio/region_mean': 0.0001792778818980878, 'epoch': 0.06}
+
+ 12%|█▏        | 123/1024 [9:04:12<67:51:22, 271.12s/it][AINFO 12-02 10:46:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:46:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:46:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:46:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 124/1024 [9:08:44<67:51:07, 271.41s/it][A
+                                                        [A{'loss': 0.0933, 'grad_norm': 0.004925912246108055, 'learning_rate': 1e-05, 'num_tokens': 43340126.0, 'completions/mean_length': 4933.484375, 'completions/min_length': 733.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4564.11279296875, 'completions/min_terminated_length': 733.0, 'completions/max_terminated_length': 14005.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.44663429260253906, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.017598867416381836, 'sampling/sampling_logp_difference/max': 7.938327789306641, 'sampling/importance_sampling_ratio/min': 0.00035680262953974307, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4698144569993019, 'clip_ratio/low_mean': 0.00036794902007386554, 'clip_ratio/low_min': 3.2104788260767236e-05, 'clip_ratio/high_mean': 7.22222671356576e-05, 'clip_ratio/high_max': 0.00023809738740965258, 'clip_ratio/region_mean': 0.000440171292211744, 'epoch': 0.06}
+
+ 12%|█▏        | 124/1024 [9:08:44<67:51:07, 271.41s/it][AINFO 12-02 10:50:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:50:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:50:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:50:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 125/1024 [9:12:54<66:10:56, 265.02s/it][A
+                                                        [A{'loss': -0.0471, 'grad_norm': 0.002808566903695464, 'learning_rate': 1e-05, 'num_tokens': 43641706.0, 'completions/mean_length': 4553.4375, 'completions/min_length': 660.0, 'completions/max_length': 14699.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4553.4375, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 14699.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.41610971093177795, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015545728616416454, 'sampling/sampling_logp_difference/max': 2.0921201705932617, 'sampling/importance_sampling_ratio/min': 0.12342517077922821, 'sampling/importance_sampling_ratio/mean': 0.999893069267273, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44897253811359406, 'clip_ratio/low_mean': 0.00018211181577498792, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0001021475932247995, 'clip_ratio/high_max': 0.0002686334864847595, 'clip_ratio/region_mean': 0.00028425940945453476, 'epoch': 0.06}
+
+ 12%|█▏        | 125/1024 [9:12:54<66:10:56, 265.02s/it][AINFO 12-02 10:55:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 126/1024 [9:18:38<71:58:53, 288.57s/it][A
+                                                        [A{'loss': 0.0355, 'grad_norm': 0.0013998980866745114, 'learning_rate': 1e-05, 'num_tokens': 44244625.0, 'completions/mean_length': 9259.359375, 'completions/min_length': 488.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.203125, 'completions/mean_terminated_length': 7443.27490234375, 'completions/min_terminated_length': 488.0, 'completions/max_terminated_length': 16092.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.421875, 'reward_std': 0.23144522309303284, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018161531537771225, 'sampling/sampling_logp_difference/max': 2.7153706550598145, 'sampling/importance_sampling_ratio/min': 0.06618041545152664, 'sampling/importance_sampling_ratio/mean': 0.9999784231185913, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46115776151418686, 'clip_ratio/low_mean': 0.00014563434706360567, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5452087331577786e-05, 'clip_ratio/high_max': 9.338358086097287e-05, 'clip_ratio/region_mean': 0.0001810864348499308, 'epoch': 0.06}
+
+ 12%|█▏        | 126/1024 [9:18:38<71:58:53, 288.57s/it][AINFO 12-02 11:00:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▏        | 127/1024 [9:23:47<73:27:42, 294.83s/it][A
+                                                        [A{'loss': 0.003, 'grad_norm': 0.0004830281832255423, 'learning_rate': 1e-05, 'num_tokens': 44654850.0, 'completions/mean_length': 6263.140625, 'completions/min_length': 1301.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.140625, 'completions/mean_terminated_length': 4607.0, 'completions/min_terminated_length': 1301.0, 'completions/max_terminated_length': 16146.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.2198973000049591, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0174238421022892, 'sampling/sampling_logp_difference/max': 16.086275100708008, 'sampling/importance_sampling_ratio/min': 1.0323322641170307e-07, 'sampling/importance_sampling_ratio/mean': 0.9999237060546875, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4873559921979904, 'clip_ratio/low_mean': 0.00019511632126523182, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5868145459971856e-05, 'clip_ratio/high_max': 0.00013696555470232852, 'clip_ratio/region_mean': 0.00023098446877156675, 'epoch': 0.06}
+
+ 12%|█▏        | 127/1024 [9:23:47<73:27:42, 294.83s/it][AINFO 12-02 11:05:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:05:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:05:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:05:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 12%|█▎        | 128/1024 [9:27:13<66:43:01, 268.06s/it][A
+                                                        [A{'loss': 0.0573, 'grad_norm': 0.0018985194619745016, 'learning_rate': 1e-05, 'num_tokens': 44916684.0, 'completions/mean_length': 3940.53125, 'completions/min_length': 826.0, 'completions/max_length': 12493.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3940.53125, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 12493.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.359375, 'reward_std': 0.23144522309303284, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.014052385464310646, 'sampling/sampling_logp_difference/max': 1.1099934577941895, 'sampling/importance_sampling_ratio/min': 0.32956111431121826, 'sampling/importance_sampling_ratio/mean': 1.0001055002212524, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43739553540945053, 'clip_ratio/low_mean': 0.00014973171028032084, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.134717644570628e-06, 'clip_ratio/high_max': 1.2538870578282513e-05, 'clip_ratio/region_mean': 0.00015286642747014412, 'epoch': 0.06}
+
+ 12%|█▎        | 128/1024 [9:27:13<66:43:01, 268.06s/it][AINFO 12-02 11:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 129/1024 [9:31:01<63:40:02, 256.09s/it][A
+                                                        [A{'loss': -0.088, 'grad_norm': 0.0027137096039950848, 'learning_rate': 1e-05, 'num_tokens': 45201119.0, 'completions/mean_length': 4282.671875, 'completions/min_length': 598.0, 'completions/max_length': 14468.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4282.671875, 'completions/min_terminated_length': 598.0, 'completions/max_terminated_length': 14468.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.47868168354034424, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.015467626042664051, 'sampling/sampling_logp_difference/max': 1.4864976406097412, 'sampling/importance_sampling_ratio/min': 0.22616338729858398, 'sampling/importance_sampling_ratio/mean': 0.9999327659606934, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40696533769369125, 'clip_ratio/low_mean': 0.0002405235263722716, 'clip_ratio/low_min': 1.1933174391742796e-05, 'clip_ratio/high_mean': 6.640903359311778e-05, 'clip_ratio/high_max': 0.00019584240590120316, 'clip_ratio/region_mean': 0.0003069325557589764, 'epoch': 0.06}
+
+ 13%|█▎        | 129/1024 [9:31:01<63:40:02, 256.09s/it][AINFO 12-02 11:13:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:13:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:13:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:13:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 130/1024 [9:35:23<64:03:37, 257.96s/it][A
+                                                        [A{'loss': -0.0213, 'grad_norm': 0.0018277267226949334, 'learning_rate': 1e-05, 'num_tokens': 45488317.0, 'completions/mean_length': 4351.84375, 'completions/min_length': 334.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 3760.09814453125, 'completions/min_terminated_length': 334.0, 'completions/max_terminated_length': 11732.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.359375, 'reward_std': 0.17782479524612427, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01826019026339054, 'sampling/sampling_logp_difference/max': 5.063012599945068, 'sampling/importance_sampling_ratio/min': 0.006326471455395222, 'sampling/importance_sampling_ratio/mean': 1.0002045631408691, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5931680500507355, 'clip_ratio/low_mean': 4.536641699814936e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.536641699814936e-05, 'epoch': 0.06}
+
+ 13%|█▎        | 130/1024 [9:35:23<64:03:37, 257.96s/it][AINFO 12-02 11:17:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:17:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:17:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:17:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 131/1024 [9:39:50<64:36:40, 260.47s/it][A
+                                                        [A{'loss': 0.0302, 'grad_norm': 0.0010811651591211557, 'learning_rate': 1e-05, 'num_tokens': 45864655.0, 'completions/mean_length': 5721.53125, 'completions/min_length': 980.0, 'completions/max_length': 14945.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5721.53125, 'completions/min_terminated_length': 980.0, 'completions/max_terminated_length': 14945.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.19727617502212524, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.016715798527002335, 'sampling/sampling_logp_difference/max': 2.1411678791046143, 'sampling/importance_sampling_ratio/min': 0.11751751601696014, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4441712759435177, 'clip_ratio/low_mean': 0.00025906614428095054, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9011744118179195e-06, 'clip_ratio/high_max': 1.1604697647271678e-05, 'clip_ratio/region_mean': 0.00026196731869276846, 'epoch': 0.06}
+
+ 13%|█▎        | 131/1024 [9:39:50<64:36:40, 260.47s/it][AINFO 12-02 11:21:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 132/1024 [9:44:08<64:24:39, 259.95s/it][A
+                                                        [A{'loss': 0.0572, 'grad_norm': 0.0036623927298933268, 'learning_rate': 1e-05, 'num_tokens': 46220322.0, 'completions/mean_length': 5409.546875, 'completions/min_length': 1058.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5235.349609375, 'completions/min_terminated_length': 1058.0, 'completions/max_terminated_length': 14638.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.4024401307106018, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017585471272468567, 'sampling/sampling_logp_difference/max': 1.7962360382080078, 'sampling/importance_sampling_ratio/min': 0.16592223942279816, 'sampling/importance_sampling_ratio/mean': 0.9999955892562866, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5009097345173359, 'clip_ratio/low_mean': 0.0002866962449843413, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.9982790642388863e-05, 'clip_ratio/high_max': 0.00016395169041061308, 'clip_ratio/region_mean': 0.0003366790297150146, 'epoch': 0.06}
+
+ 13%|█▎        | 132/1024 [9:44:08<64:24:39, 259.95s/it][AINFO 12-02 11:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 133/1024 [9:48:54<66:13:13, 267.56s/it][A
+                                                        [A{'loss': -0.004, 'grad_norm': 0.0012894254177808762, 'learning_rate': 1e-05, 'num_tokens': 46737234.0, 'completions/mean_length': 7923.25, 'completions/min_length': 2274.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7788.95263671875, 'completions/min_terminated_length': 2274.0, 'completions/max_terminated_length': 15769.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.3403330445289612, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02035873383283615, 'sampling/sampling_logp_difference/max': 1.3556551933288574, 'sampling/importance_sampling_ratio/min': 0.2685960531234741, 'sampling/importance_sampling_ratio/mean': 1.0000241994857788, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5559198185801506, 'clip_ratio/low_mean': 0.00022198507758730557, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.62535253468377e-05, 'clip_ratio/high_max': 0.00021282040142978076, 'clip_ratio/region_mean': 0.0002882385942939436, 'epoch': 0.06}
+
+ 13%|█▎        | 133/1024 [9:48:54<66:13:13, 267.56s/it][AINFO 12-02 11:30:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:30:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:30:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:30:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 134/1024 [9:52:31<62:26:51, 252.60s/it][A
+                                                        [A{'loss': 0.0402, 'grad_norm': 0.0010989592410624027, 'learning_rate': 1e-05, 'num_tokens': 47011610.0, 'completions/mean_length': 4135.375, 'completions/min_length': 798.0, 'completions/max_length': 14945.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4135.375, 'completions/min_terminated_length': 798.0, 'completions/max_terminated_length': 14945.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.37981897592544556, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015536255203187466, 'sampling/sampling_logp_difference/max': 2.166813373565674, 'sampling/importance_sampling_ratio/min': 0.11454203724861145, 'sampling/importance_sampling_ratio/mean': 1.0000196695327759, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.393577978014946, 'clip_ratio/low_mean': 8.085949048108887e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.247289982435177e-05, 'clip_ratio/high_max': 0.00018403001740807667, 'clip_ratio/region_mean': 0.00013333239030544064, 'epoch': 0.06}
+
+ 13%|█▎        | 134/1024 [9:52:31<62:26:51, 252.60s/it][AINFO 12-02 11:34:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:34:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:34:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:34:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 135/1024 [9:57:01<63:39:40, 257.80s/it][A
+                                                        [A{'loss': -0.0157, 'grad_norm': 0.0007335072150453925, 'learning_rate': 1e-05, 'num_tokens': 47400891.0, 'completions/mean_length': 5928.640625, 'completions/min_length': 1024.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5762.68310546875, 'completions/min_terminated_length': 1024.0, 'completions/max_terminated_length': 14844.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.31300368905067444, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01867317594587803, 'sampling/sampling_logp_difference/max': 13.749968528747559, 'sampling/importance_sampling_ratio/min': 1.0677375712475623e-06, 'sampling/importance_sampling_ratio/mean': 0.9999682307243347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48996758460998535, 'clip_ratio/low_mean': 0.00020736139549626387, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.1033524843878695e-05, 'clip_ratio/high_max': 0.00018842957706510788, 'clip_ratio/region_mean': 0.00026839491783903213, 'epoch': 0.06}
+
+ 13%|█▎        | 135/1024 [9:57:01<63:39:40, 257.80s/it][AINFO 12-02 11:39:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:39:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:39:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:39:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 136/1024 [10:02:20<68:04:00, 275.95s/it][A
+                                                         [A{'loss': 0.0173, 'grad_norm': 0.0018435430247336626, 'learning_rate': 1e-05, 'num_tokens': 47832280.0, 'completions/mean_length': 6600.453125, 'completions/min_length': 1041.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 5202.8037109375, 'completions/min_terminated_length': 1041.0, 'completions/max_terminated_length': 15929.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.4071483612060547, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.012917717918753624, 'sampling/sampling_logp_difference/max': 2.2161450386047363, 'sampling/importance_sampling_ratio/min': 0.10902860015630722, 'sampling/importance_sampling_ratio/mean': 0.9998989701271057, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.30898283794522285, 'clip_ratio/low_mean': 0.0002191008043155307, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.886457165615866e-05, 'clip_ratio/high_max': 0.00016620574297121493, 'clip_ratio/region_mean': 0.00027796537688118406, 'epoch': 0.06}
+
+ 13%|█▎        | 136/1024 [10:02:20<68:04:00, 275.95s/it][AINFO 12-02 11:44:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 137/1024 [10:06:27<65:51:45, 267.31s/it][A
+                                                         [A{'loss': -0.0668, 'grad_norm': 0.0033298067282885313, 'learning_rate': 1e-05, 'num_tokens': 48102683.0, 'completions/mean_length': 4061.296875, 'completions/min_length': 630.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 3663.790283203125, 'completions/min_terminated_length': 630.0, 'completions/max_terminated_length': 11954.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.40822193026542664, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.014145282097160816, 'sampling/sampling_logp_difference/max': 1.5411853790283203, 'sampling/importance_sampling_ratio/min': 0.26221078634262085, 'sampling/importance_sampling_ratio/mean': 0.9999774098396301, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40886691212654114, 'clip_ratio/low_mean': 0.0001422199188709783, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.873433113061765e-05, 'clip_ratio/high_max': 0.00016187672281375853, 'clip_ratio/region_mean': 0.0001909542522753327, 'epoch': 0.06}
+
+ 13%|█▎        | 137/1024 [10:06:27<65:51:45, 267.31s/it][AINFO 12-02 11:48:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:48:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:48:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:48:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 13%|█▎        | 138/1024 [10:11:07<66:45:03, 271.22s/it][A
+                                                         [A{'loss': 0.0076, 'grad_norm': 0.0028656241483986378, 'learning_rate': 1e-05, 'num_tokens': 48558389.0, 'completions/mean_length': 6955.78125, 'completions/min_length': 853.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6651.64501953125, 'completions/min_terminated_length': 853.0, 'completions/max_terminated_length': 15859.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.40625, 'reward_std': 0.1735912710428238, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019980821758508682, 'sampling/sampling_logp_difference/max': 3.4218673706054688, 'sampling/importance_sampling_ratio/min': 0.03265140578150749, 'sampling/importance_sampling_ratio/mean': 1.0000100135803223, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5771390646696091, 'clip_ratio/low_mean': 0.00018130874991584278, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.920493468787754e-05, 'clip_ratio/high_max': 6.466257036663592e-05, 'clip_ratio/region_mean': 0.00020051368073836784, 'epoch': 0.06}
+
+ 13%|█▎        | 138/1024 [10:11:07<66:45:03, 271.22s/it][AINFO 12-02 11:53:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:53:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:53:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:53:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▎        | 139/1024 [10:16:10<69:00:59, 280.74s/it][A
+                                                         [A{'loss': 0.0514, 'grad_norm': 0.001182848820462823, 'learning_rate': 1e-05, 'num_tokens': 48952401.0, 'completions/mean_length': 6020.6875, 'completions/min_length': 582.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 4540.21435546875, 'completions/min_terminated_length': 582.0, 'completions/max_terminated_length': 15838.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.38452720642089844, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016862154006958008, 'sampling/sampling_logp_difference/max': 1.8373703956604004, 'sampling/importance_sampling_ratio/min': 0.1592356115579605, 'sampling/importance_sampling_ratio/mean': 1.0001232624053955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4535091556608677, 'clip_ratio/low_mean': 0.00020014116944366833, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9976848054502625e-05, 'clip_ratio/high_max': 8.617146158940159e-05, 'clip_ratio/region_mean': 0.00023011801931716036, 'epoch': 0.06}
+
+ 14%|█▎        | 139/1024 [10:16:10<69:00:59, 280.74s/it][AINFO 12-02 11:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:58:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▎        | 140/1024 [10:20:23<66:52:03, 272.31s/it][A
+                                                         [A{'loss': 0.056, 'grad_norm': 0.0022725528106093407, 'learning_rate': 1e-05, 'num_tokens': 49272067.0, 'completions/mean_length': 4849.03125, 'completions/min_length': 771.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4476.935546875, 'completions/min_terminated_length': 771.0, 'completions/max_terminated_length': 13951.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.4024401307106018, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014701185747981071, 'sampling/sampling_logp_difference/max': 2.4141993522644043, 'sampling/importance_sampling_ratio/min': 0.08943892270326614, 'sampling/importance_sampling_ratio/mean': 1.0000264644622803, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40223439782857895, 'clip_ratio/low_mean': 0.00019048374269914348, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.912780050541187e-05, 'clip_ratio/high_max': 0.00014737310675627668, 'clip_ratio/region_mean': 0.00022961154991207877, 'epoch': 0.06}
+
+ 14%|█▎        | 140/1024 [10:20:23<66:52:03, 272.31s/it][AINFO 12-02 12:02:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:02:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:02:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:02:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 141/1024 [10:24:58<66:59:05, 273.10s/it][A
+                                                         [A{'loss': 0.0209, 'grad_norm': 0.0012264687102288008, 'learning_rate': 1e-05, 'num_tokens': 49634431.0, 'completions/mean_length': 5515.0625, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5164.45166015625, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 15055.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.46875, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01622956432402134, 'sampling/sampling_logp_difference/max': 4.622800827026367, 'sampling/importance_sampling_ratio/min': 0.009825238958001137, 'sampling/importance_sampling_ratio/mean': 1.0000790357589722, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.416252925992012, 'clip_ratio/low_mean': 0.00019687762051034952, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6589497224449588e-05, 'clip_ratio/high_max': 0.00010635798889779835, 'clip_ratio/region_mean': 0.0002234671210317174, 'epoch': 0.06}
+
+ 14%|█▍        | 141/1024 [10:24:58<66:59:05, 273.10s/it][AINFO 12-02 12:07:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:07:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:07:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:07:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 142/1024 [10:30:04<69:22:56, 283.19s/it][A
+                                                         [A{'loss': -0.0335, 'grad_norm': 0.0009716386557556689, 'learning_rate': 1e-05, 'num_tokens': 50105870.0, 'completions/mean_length': 7238.359375, 'completions/min_length': 559.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 5931.83935546875, 'completions/min_terminated_length': 559.0, 'completions/max_terminated_length': 16233.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.3740273714065552, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014967331662774086, 'sampling/sampling_logp_difference/max': 2.3943240642547607, 'sampling/importance_sampling_ratio/min': 0.09123432636260986, 'sampling/importance_sampling_ratio/mean': 1.0000395774841309, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3966602236032486, 'clip_ratio/low_mean': 0.00027144614705321146, 'clip_ratio/low_min': 1.9210081518394873e-05, 'clip_ratio/high_mean': 4.678626066834113e-05, 'clip_ratio/high_max': 0.0001557968794259068, 'clip_ratio/region_mean': 0.0003182324035151396, 'epoch': 0.07}
+
+ 14%|█▍        | 142/1024 [10:30:04<69:22:56, 283.19s/it][AINFO 12-02 12:12:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:12:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:12:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:12:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 143/1024 [10:34:43<68:57:56, 281.81s/it][A
+                                                         [A{'loss': 0.0709, 'grad_norm': 0.002418189076706767, 'learning_rate': 1e-05, 'num_tokens': 50457722.0, 'completions/mean_length': 5373.0625, 'completions/min_length': 504.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4831.54052734375, 'completions/min_terminated_length': 504.0, 'completions/max_terminated_length': 15307.0, 'rewards/accuracy_reward/mean': 0.671875, 'rewards/accuracy_reward/std': 0.4732423722743988, 'reward': 0.671875, 'reward_std': 0.26196980476379395, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01585206389427185, 'sampling/sampling_logp_difference/max': 2.9977500438690186, 'sampling/importance_sampling_ratio/min': 0.04989921674132347, 'sampling/importance_sampling_ratio/mean': 0.9999256134033203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40711716189980507, 'clip_ratio/low_mean': 0.00019303639101053705, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.578200120406109e-05, 'clip_ratio/high_max': 0.00026043069738079794, 'clip_ratio/region_mean': 0.00026881839039560873, 'epoch': 0.07}
+
+ 14%|█▍        | 143/1024 [10:34:43<68:57:56, 281.81s/it][AINFO 12-02 12:16:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 144/1024 [10:39:02<67:13:14, 274.99s/it][A
+                                                         [A{'loss': 0.0318, 'grad_norm': 0.001620751922018826, 'learning_rate': 1e-05, 'num_tokens': 50815833.0, 'completions/mean_length': 5450.484375, 'completions/min_length': 972.0, 'completions/max_length': 16149.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5450.484375, 'completions/min_terminated_length': 972.0, 'completions/max_terminated_length': 16149.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.35824596881866455, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016062598675489426, 'sampling/sampling_logp_difference/max': 6.510285377502441, 'sampling/importance_sampling_ratio/min': 0.0014880551025271416, 'sampling/importance_sampling_ratio/mean': 1.0001866817474365, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43867697194218636, 'clip_ratio/low_mean': 0.00017302668879892735, 'clip_ratio/low_min': 3.9544447645312175e-05, 'clip_ratio/high_mean': 4.6009224433873896e-05, 'clip_ratio/high_max': 0.00014343776365421945, 'clip_ratio/region_mean': 0.0002190359145970433, 'epoch': 0.07}
+
+ 14%|█▍        | 144/1024 [10:39:02<67:13:14, 274.99s/it][AINFO 12-02 12:21:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:21:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:21:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:21:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 145/1024 [10:41:15<56:42:31, 232.25s/it][A
+                                                         [A{'loss': 0.0143, 'grad_norm': 0.0011067023733630776, 'learning_rate': 1e-05, 'num_tokens': 51068921.0, 'completions/mean_length': 3812.625, 'completions/min_length': 679.0, 'completions/max_length': 8617.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3812.625, 'completions/min_terminated_length': 679.0, 'completions/max_terminated_length': 8617.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.27564918994903564, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.012999343685805798, 'sampling/sampling_logp_difference/max': 0.9584388732910156, 'sampling/importance_sampling_ratio/min': 0.3834911286830902, 'sampling/importance_sampling_ratio/mean': 1.0001009702682495, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.346145398914814, 'clip_ratio/low_mean': 0.00016274654581138748, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3416892872774042e-05, 'clip_ratio/high_max': 9.366757149109617e-05, 'clip_ratio/region_mean': 0.00018616344141264562, 'epoch': 0.07}
+
+ 14%|█▍        | 145/1024 [10:41:15<56:42:31, 232.25s/it][AINFO 12-02 12:23:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:23:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:23:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:23:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 146/1024 [10:45:23<57:50:16, 237.15s/it][A
+                                                         [A{'loss': -0.0325, 'grad_norm': 0.0021463430020958185, 'learning_rate': 1e-05, 'num_tokens': 51342061.0, 'completions/mean_length': 4128.1875, 'completions/min_length': 867.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3933.651123046875, 'completions/min_terminated_length': 867.0, 'completions/max_terminated_length': 11869.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.23356689512729645, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018250087276101112, 'sampling/sampling_logp_difference/max': 2.691512107849121, 'sampling/importance_sampling_ratio/min': 0.06777837127447128, 'sampling/importance_sampling_ratio/mean': 0.9999420642852783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4753922149538994, 'clip_ratio/low_mean': 0.00011860240374517161, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9752456839560182e-05, 'clip_ratio/high_max': 0.00011900982735824073, 'clip_ratio/region_mean': 0.00014835485922048974, 'epoch': 0.07}
+
+ 14%|█▍        | 146/1024 [10:45:23<57:50:16, 237.15s/it][AINFO 12-02 12:27:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 147/1024 [10:49:49<59:52:48, 245.80s/it][A
+                                                         [A{'loss': 0.0066, 'grad_norm': 0.0021687964908778667, 'learning_rate': 1e-05, 'num_tokens': 51618065.0, 'completions/mean_length': 4156.6875, 'completions/min_length': 759.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 3762.258056640625, 'completions/min_terminated_length': 759.0, 'completions/max_terminated_length': 13949.0, 'rewards/accuracy_reward/mean': 0.75, 'rewards/accuracy_reward/std': 0.4364357888698578, 'reward': 0.75, 'reward_std': 0.38877052068710327, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.013988473452627659, 'sampling/sampling_logp_difference/max': 1.1137652397155762, 'sampling/importance_sampling_ratio/min': 0.3283204138278961, 'sampling/importance_sampling_ratio/mean': 0.9999649524688721, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.38607434555888176, 'clip_ratio/low_mean': 0.0001506910575699294, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.55807512228057e-05, 'clip_ratio/high_max': 0.00017155795012513408, 'clip_ratio/region_mean': 0.00020627180811061407, 'epoch': 0.07}
+
+ 14%|█▍        | 147/1024 [10:49:49<59:52:48, 245.80s/it][AINFO 12-02 12:31:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:31:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:31:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:31:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 14%|█▍        | 148/1024 [10:54:46<63:31:22, 261.05s/it][A
+                                                         [A{'loss': 0.084, 'grad_norm': 0.0012283200630918145, 'learning_rate': 1e-05, 'num_tokens': 51968629.0, 'completions/mean_length': 5339.5625, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4403.59326171875, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 15740.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.2961388826370239, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015119784511625767, 'sampling/sampling_logp_difference/max': 2.4099557399749756, 'sampling/importance_sampling_ratio/min': 0.0898192748427391, 'sampling/importance_sampling_ratio/mean': 1.000002145767212, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.391679085791111, 'clip_ratio/low_mean': 0.00012603288951140712, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.231338794445037e-05, 'clip_ratio/high_max': 8.925355177780148e-05, 'clip_ratio/region_mean': 0.00014834627882009954, 'epoch': 0.07}
+
+ 14%|█▍        | 148/1024 [10:54:46<63:31:22, 261.05s/it][AINFO 12-02 12:36:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:36:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:36:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:36:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 149/1024 [10:59:59<67:15:15, 276.70s/it][A
+                                                         [A{'loss': -0.0099, 'grad_norm': 0.00046323848073370755, 'learning_rate': 1e-05, 'num_tokens': 52473505.0, 'completions/mean_length': 7748.6875, 'completions/min_length': 539.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.140625, 'completions/mean_terminated_length': 6335.63623046875, 'completions/min_terminated_length': 539.0, 'completions/max_terminated_length': 16216.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.4375, 'reward_std': 0.1462520956993103, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021071333438158035, 'sampling/sampling_logp_difference/max': 12.920390129089355, 'sampling/importance_sampling_ratio/min': 2.4476305497955764e-06, 'sampling/importance_sampling_ratio/mean': 0.9999663233757019, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5589723959565163, 'clip_ratio/low_mean': 4.2980281250493135e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7013136104869773e-05, 'clip_ratio/high_max': 0.00010805254441947909, 'clip_ratio/region_mean': 6.999341735536291e-05, 'epoch': 0.07}
+
+ 15%|█▍        | 149/1024 [10:59:59<67:15:15, 276.70s/it][AINFO 12-02 12:42:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 150/1024 [11:04:55<68:34:25, 282.45s/it][A
+                                                         [A{'loss': 0.0445, 'grad_norm': 0.00246169650927186, 'learning_rate': 1e-05, 'num_tokens': 52832237.0, 'completions/mean_length': 5436.3125, 'completions/min_length': 640.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4897.9013671875, 'completions/min_terminated_length': 640.0, 'completions/max_terminated_length': 14793.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.40715816617012024, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017020856961607933, 'sampling/sampling_logp_difference/max': 1.699141502380371, 'sampling/importance_sampling_ratio/min': 0.18284042179584503, 'sampling/importance_sampling_ratio/mean': 1.0000033378601074, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4680928438901901, 'clip_ratio/low_mean': 0.00020821194902964635, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.167685109583545e-05, 'clip_ratio/high_max': 0.00020305007092247251, 'clip_ratio/region_mean': 0.00028988880148972385, 'epoch': 0.07}
+
+ 15%|█▍        | 150/1024 [11:04:55<68:34:25, 282.45s/it][AINFO 12-02 12:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 151/1024 [11:09:25<67:35:42, 278.74s/it][A
+                                                         [A{'loss': 0.0246, 'grad_norm': 0.0028162088710814714, 'learning_rate': 1e-05, 'num_tokens': 53159226.0, 'completions/mean_length': 4989.203125, 'completions/min_length': 242.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4808.33349609375, 'completions/min_terminated_length': 242.0, 'completions/max_terminated_length': 15675.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.4024401307106018, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015343603678047657, 'sampling/sampling_logp_difference/max': 1.6297738552093506, 'sampling/importance_sampling_ratio/min': 0.19597387313842773, 'sampling/importance_sampling_ratio/mean': 0.9998958110809326, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.47258124873042107, 'clip_ratio/low_mean': 0.0001526722771814093, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.65282015890989e-05, 'clip_ratio/high_max': 0.0002501321432646364, 'clip_ratio/region_mean': 0.00022920047285879264, 'epoch': 0.07}
+
+ 15%|█▍        | 151/1024 [11:09:25<67:35:42, 278.74s/it][AINFO 12-02 12:51:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:51:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:51:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:51:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 152/1024 [11:13:45<66:09:47, 273.15s/it][A
+                                                         [A{'loss': 0.1136, 'grad_norm': 0.0007656652014702559, 'learning_rate': 1e-05, 'num_tokens': 53537401.0, 'completions/mean_length': 5760.609375, 'completions/min_length': 661.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5052.3837890625, 'completions/min_terminated_length': 661.0, 'completions/max_terminated_length': 13124.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.2709311842918396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017615780234336853, 'sampling/sampling_logp_difference/max': 3.2812328338623047, 'sampling/importance_sampling_ratio/min': 0.03758189454674721, 'sampling/importance_sampling_ratio/mean': 1.000088095664978, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46132899448275566, 'clip_ratio/low_mean': 0.00013527670921575918, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.783625015785219e-05, 'clip_ratio/high_max': 0.0001292225251745549, 'clip_ratio/region_mean': 0.0001731129564177536, 'epoch': 0.07}
+
+ 15%|█▍        | 152/1024 [11:13:45<66:09:47, 273.15s/it][AINFO 12-02 12:55:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:55:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:55:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:55:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▍        | 153/1024 [11:17:59<64:40:19, 267.30s/it][A
+                                                         [A{'loss': 0.0038, 'grad_norm': 0.0025049711111932993, 'learning_rate': 1e-05, 'num_tokens': 53978800.0, 'completions/mean_length': 6762.609375, 'completions/min_length': 1181.0, 'completions/max_length': 15168.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6762.609375, 'completions/min_terminated_length': 1181.0, 'completions/max_terminated_length': 15168.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.2619796097278595, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01840978115797043, 'sampling/sampling_logp_difference/max': 1.490939736366272, 'sampling/importance_sampling_ratio/min': 0.22516095638275146, 'sampling/importance_sampling_ratio/mean': 1.0000197887420654, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.47886158153414726, 'clip_ratio/low_mean': 0.00027991298702545464, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.321846159247798e-05, 'clip_ratio/high_max': 7.608405394421425e-05, 'clip_ratio/region_mean': 0.00030313144816318527, 'epoch': 0.07}
+
+ 15%|█▍        | 153/1024 [11:17:59<64:40:19, 267.30s/it][AINFO 12-02 13:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:00:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:00:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 154/1024 [11:22:28<64:44:22, 267.89s/it][A
+                                                         [A{'loss': 0.0024, 'grad_norm': 0.0006821770220994949, 'learning_rate': 1e-05, 'num_tokens': 54280159.0, 'completions/mean_length': 4554.109375, 'completions/min_length': 457.0, 'completions/max_length': 15844.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4554.109375, 'completions/min_terminated_length': 457.0, 'completions/max_terminated_length': 15844.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018870409578084946, 'sampling/sampling_logp_difference/max': 9.369060516357422, 'sampling/importance_sampling_ratio/min': 8.532351057510823e-05, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5032233372330666, 'clip_ratio/low_mean': 8.140349200402852e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0195017441437813e-05, 'clip_ratio/high_max': 8.078006976575125e-05, 'clip_ratio/region_mean': 0.00010159851080970839, 'epoch': 0.07}
+
+ 15%|█▌        | 154/1024 [11:22:28<64:44:22, 267.89s/it][AINFO 12-02 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 155/1024 [11:26:38<63:22:14, 262.52s/it][A
+                                                         [A{'loss': -0.004, 'grad_norm': 0.0020080553367733955, 'learning_rate': 1e-05, 'num_tokens': 54628583.0, 'completions/mean_length': 5302.0, 'completions/min_length': 632.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5126.095703125, 'completions/min_terminated_length': 632.0, 'completions/max_terminated_length': 16059.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019575171172618866, 'sampling/sampling_logp_difference/max': 1.6480439901351929, 'sampling/importance_sampling_ratio/min': 0.3164248466491699, 'sampling/importance_sampling_ratio/mean': 1.0000717639923096, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48817237466573715, 'clip_ratio/low_mean': 0.00021365572729337146, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.539213483032654e-05, 'clip_ratio/high_max': 8.099403203232214e-05, 'clip_ratio/region_mean': 0.00023904785984996124, 'epoch': 0.07}
+
+ 15%|█▌        | 155/1024 [11:26:38<63:22:14, 262.52s/it][AINFO 12-02 13:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:08:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 156/1024 [11:30:58<63:07:12, 261.79s/it][A
+                                                         [A{'loss': 0.0566, 'grad_norm': 0.0024579386226832867, 'learning_rate': 1e-05, 'num_tokens': 54935543.0, 'completions/mean_length': 4650.25, 'completions/min_length': 846.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4271.74169921875, 'completions/min_terminated_length': 846.0, 'completions/max_terminated_length': 12970.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.16675157845020294, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.014533299021422863, 'sampling/sampling_logp_difference/max': 1.6576356887817383, 'sampling/importance_sampling_ratio/min': 0.1905890703201294, 'sampling/importance_sampling_ratio/mean': 0.9999713897705078, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3787198141217232, 'clip_ratio/low_mean': 8.073771277850028e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1118766337858688e-05, 'clip_ratio/high_max': 4.447506535143475e-05, 'clip_ratio/region_mean': 9.185648013954051e-05, 'epoch': 0.07}
+
+ 15%|█▌        | 156/1024 [11:30:58<63:07:12, 261.79s/it][AINFO 12-02 13:13:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:13:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:13:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:13:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 157/1024 [11:35:40<64:32:04, 267.96s/it][A
+                                                         [A{'loss': -0.0259, 'grad_norm': 0.0015238798223435879, 'learning_rate': 1e-05, 'num_tokens': 55279722.0, 'completions/mean_length': 5235.421875, 'completions/min_length': 317.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4687.130859375, 'completions/min_terminated_length': 317.0, 'completions/max_terminated_length': 16033.0, 'rewards/accuracy_reward/mean': 0.671875, 'rewards/accuracy_reward/std': 0.4732423722743988, 'reward': 0.671875, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015229538083076477, 'sampling/sampling_logp_difference/max': 11.186578750610352, 'sampling/importance_sampling_ratio/min': 1.3858958482160233e-05, 'sampling/importance_sampling_ratio/mean': 0.9999769330024719, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.39913783594965935, 'clip_ratio/low_mean': 0.00011961562449869234, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.395449119940167e-05, 'clip_ratio/high_max': 0.0001951979866134934, 'clip_ratio/region_mean': 0.0001935701138791046, 'epoch': 0.07}
+
+ 15%|█▌        | 157/1024 [11:35:40<64:32:04, 267.96s/it][AINFO 12-02 13:17:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:17:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 15%|█▌        | 158/1024 [11:40:10<64:33:09, 268.35s/it][A
+                                                         [A{'loss': 0.0942, 'grad_norm': 0.0017066102009266615, 'learning_rate': 1e-05, 'num_tokens': 55584695.0, 'completions/mean_length': 4606.453125, 'completions/min_length': 850.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4027.229248046875, 'completions/min_terminated_length': 850.0, 'completions/max_terminated_length': 13413.0, 'rewards/accuracy_reward/mean': 0.8125, 'rewards/accuracy_reward/std': 0.39339789748191833, 'reward': 0.8125, 'reward_std': 0.2756394147872925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01685759611427784, 'sampling/sampling_logp_difference/max': 1.7447569370269775, 'sampling/importance_sampling_ratio/min': 0.1746874451637268, 'sampling/importance_sampling_ratio/mean': 0.999865710735321, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4424309507012367, 'clip_ratio/low_mean': 0.00013953854931969545, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.680043770757038e-05, 'clip_ratio/high_max': 0.00021168462626519613, 'clip_ratio/region_mean': 0.00020633898566302378, 'epoch': 0.07}
+
+ 15%|█▌        | 158/1024 [11:40:10<64:33:09, 268.35s/it][AINFO 12-02 13:22:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:22:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:22:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:22:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 159/1024 [11:44:59<66:00:55, 274.75s/it][A
+                                                         [A{'loss': 0.0079, 'grad_norm': 0.0018331394530832767, 'learning_rate': 1e-05, 'num_tokens': 55902363.0, 'completions/mean_length': 4833.1875, 'completions/min_length': 786.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 3854.30517578125, 'completions/min_terminated_length': 786.0, 'completions/max_terminated_length': 16343.0, 'rewards/accuracy_reward/mean': 0.671875, 'rewards/accuracy_reward/std': 0.4732423722743988, 'reward': 0.671875, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01841835305094719, 'sampling/sampling_logp_difference/max': 2.012812614440918, 'sampling/importance_sampling_ratio/min': 0.13361233472824097, 'sampling/importance_sampling_ratio/mean': 0.9999761581420898, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4819244481623173, 'clip_ratio/low_mean': 8.928693659981946e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.646584750389593e-05, 'clip_ratio/high_max': 0.0001713420156193024, 'clip_ratio/region_mean': 0.00013575278182997863, 'epoch': 0.07}
+
+ 16%|█▌        | 159/1024 [11:44:59<66:00:55, 274.75s/it][AINFO 12-02 13:27:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 160/1024 [11:49:36<66:05:59, 275.42s/it][A
+                                                         [A{'loss': 0.0879, 'grad_norm': 0.001786656561307609, 'learning_rate': 1e-05, 'num_tokens': 56264045.0, 'completions/mean_length': 5516.40625, 'completions/min_length': 170.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5165.83837890625, 'completions/min_terminated_length': 170.0, 'completions/max_terminated_length': 15813.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.42081791162490845, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01607315056025982, 'sampling/sampling_logp_difference/max': 4.186697483062744, 'sampling/importance_sampling_ratio/min': 0.015196387656033039, 'sampling/importance_sampling_ratio/mean': 1.0000667572021484, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4045892246067524, 'clip_ratio/low_mean': 0.00027025709005101817, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.880156342958799e-05, 'clip_ratio/high_max': 0.00018580141659185756, 'clip_ratio/region_mean': 0.00033905864984262735, 'epoch': 0.07}
+
+ 16%|█▌        | 160/1024 [11:49:36<66:05:59, 275.42s/it][AINFO 12-02 13:31:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:31:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:31:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:31:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 161/1024 [11:54:39<68:00:33, 283.70s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.00038928701542317867, 'learning_rate': 1e-05, 'num_tokens': 56695839.0, 'completions/mean_length': 6590.03125, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5760.03369140625, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 16131.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.32195523381233215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016497332602739334, 'sampling/sampling_logp_difference/max': 8.531218528747559, 'sampling/importance_sampling_ratio/min': 0.00019721451099030674, 'sampling/importance_sampling_ratio/mean': 0.9999597668647766, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42640724033117294, 'clip_ratio/low_mean': 0.00022537845143233426, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.738305258593755e-05, 'clip_ratio/high_max': 0.00012893573511973955, 'clip_ratio/region_mean': 0.0002627614976518089, 'epoch': 0.07}
+
+ 16%|█▌        | 161/1024 [11:54:39<68:00:33, 283.70s/it][AINFO 12-02 13:36:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 162/1024 [11:59:07<66:48:13, 278.99s/it][A
+                                                         [A{'loss': 0.0208, 'grad_norm': 0.005779350642114878, 'learning_rate': 1e-05, 'num_tokens': 57003731.0, 'completions/mean_length': 4662.0625, 'completions/min_length': 279.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4085.573486328125, 'completions/min_terminated_length': 279.0, 'completions/max_terminated_length': 12927.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.3661493957042694, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014687095768749714, 'sampling/sampling_logp_difference/max': 1.3285677433013916, 'sampling/importance_sampling_ratio/min': 0.26485633850097656, 'sampling/importance_sampling_ratio/mean': 1.0000648498535156, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3794623836874962, 'clip_ratio/low_mean': 0.00019813395738310646, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.0077865239472885e-05, 'clip_ratio/high_max': 0.00016208443094001268, 'clip_ratio/region_mean': 0.0002482118306943448, 'epoch': 0.07}
+
+ 16%|█▌        | 162/1024 [11:59:07<66:48:13, 278.99s/it][AINFO 12-02 13:41:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:41:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:41:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:41:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 163/1024 [12:03:02<63:31:24, 265.60s/it][A
+                                                         [A{'loss': -0.0202, 'grad_norm': 0.001961949048563838, 'learning_rate': 1e-05, 'num_tokens': 57240461.0, 'completions/mean_length': 3416.90625, 'completions/min_length': 377.0, 'completions/max_length': 16330.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3416.90625, 'completions/min_terminated_length': 377.0, 'completions/max_terminated_length': 16330.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.40625, 'reward_std': 0.34929439425468445, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016173910349607468, 'sampling/sampling_logp_difference/max': 2.4066717624664307, 'sampling/importance_sampling_ratio/min': 0.09011472016572952, 'sampling/importance_sampling_ratio/mean': 0.9998192191123962, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42256590351462364, 'clip_ratio/low_mean': 0.0001744671271808329, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.727276857214747e-05, 'clip_ratio/high_max': 0.00018149921743315645, 'clip_ratio/region_mean': 0.00023173989575298037, 'epoch': 0.07}
+
+ 16%|█▌        | 163/1024 [12:03:02<63:31:24, 265.60s/it][AINFO 12-02 13:45:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:45:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:45:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:45:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 164/1024 [12:07:59<65:43:10, 275.11s/it][A
+                                                         [A{'loss': -0.0276, 'grad_norm': 0.0019788017962127924, 'learning_rate': 1e-05, 'num_tokens': 57603270.0, 'completions/mean_length': 5515.890625, 'completions/min_length': 412.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 4391.603515625, 'completions/min_terminated_length': 412.0, 'completions/max_terminated_length': 13851.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.25513991713523865, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01754840463399887, 'sampling/sampling_logp_difference/max': 9.854959487915039, 'sampling/importance_sampling_ratio/min': 5.248624074738473e-05, 'sampling/importance_sampling_ratio/mean': 0.9999870657920837, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4544442184269428, 'clip_ratio/low_mean': 0.00020246001668056124, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.114935205099755e-05, 'clip_ratio/high_max': 8.45974082039902e-05, 'clip_ratio/region_mean': 0.00022360936645782203, 'epoch': 0.08}
+
+ 16%|█▌        | 164/1024 [12:07:59<65:43:10, 275.11s/it][AINFO 12-02 13:50:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 165/1024 [12:12:22<64:45:29, 271.40s/it][A
+                                                         [A{'loss': 0.0225, 'grad_norm': 0.0029603042639791965, 'learning_rate': 1e-05, 'num_tokens': 57925326.0, 'completions/mean_length': 4912.875, 'completions/min_length': 560.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4542.83837890625, 'completions/min_terminated_length': 560.0, 'completions/max_terminated_length': 13856.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.3855752944946289, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014563363045454025, 'sampling/sampling_logp_difference/max': 1.1492018699645996, 'sampling/importance_sampling_ratio/min': 0.3253903388977051, 'sampling/importance_sampling_ratio/mean': 0.9999769926071167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3865491822361946, 'clip_ratio/low_mean': 0.0002029768065767712, 'clip_ratio/low_min': 1.8269511201651767e-05, 'clip_ratio/high_mean': 6.299860660874401e-05, 'clip_ratio/high_max': 0.00017297254635195713, 'clip_ratio/region_mean': 0.00026597541364026256, 'epoch': 0.08}
+
+ 16%|█▌        | 165/1024 [12:12:22<64:45:29, 271.40s/it][AINFO 12-02 13:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:54:27 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▌        | 166/1024 [12:15:20<58:02:26, 243.53s/it][A
+                                                         [A{'loss': 0.0025, 'grad_norm': 0.0018704166868701577, 'learning_rate': 1e-05, 'num_tokens': 58160401.0, 'completions/mean_length': 3496.296875, 'completions/min_length': 362.0, 'completions/max_length': 11657.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3496.296875, 'completions/min_terminated_length': 362.0, 'completions/max_terminated_length': 11657.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5039526224136353, 'reward': 0.5, 'reward_std': 0.38452720642089844, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017237184569239616, 'sampling/sampling_logp_difference/max': 1.9520455598831177, 'sampling/importance_sampling_ratio/min': 0.14198334515094757, 'sampling/importance_sampling_ratio/mean': 1.000113844871521, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.47157398238778114, 'clip_ratio/low_mean': 0.00026542034447629703, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.083443268929841e-05, 'clip_ratio/high_max': 8.333773075719364e-05, 'clip_ratio/region_mean': 0.00028625477352761663, 'epoch': 0.08}
+
+ 16%|█▌        | 166/1024 [12:15:20<58:02:26, 243.53s/it][AINFO 12-02 13:57:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:57:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:57:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:57:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▋        | 167/1024 [12:19:27<58:14:21, 244.65s/it][A
+                                                         [A{'loss': 0.1756, 'grad_norm': 0.0012130150571465492, 'learning_rate': 1e-05, 'num_tokens': 58436357.0, 'completions/mean_length': 4094.6875, 'completions/min_length': 490.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 3698.258056640625, 'completions/min_terminated_length': 490.0, 'completions/max_terminated_length': 12389.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.375, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015365684404969215, 'sampling/sampling_logp_difference/max': 1.2257864475250244, 'sampling/importance_sampling_ratio/min': 0.2935267686843872, 'sampling/importance_sampling_ratio/mean': 1.0001065731048584, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42410871759057045, 'clip_ratio/low_mean': 0.00016190460519283079, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.068586599714763e-05, 'clip_ratio/high_max': 0.00014508560070680687, 'clip_ratio/region_mean': 0.0002025904723268468, 'epoch': 0.08}
+
+ 16%|█▋        | 167/1024 [12:19:27<58:14:21, 244.65s/it][AINFO 12-02 14:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:01:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 16%|█▋        | 168/1024 [12:24:15<61:15:34, 257.63s/it][A
+                                                         [A{'loss': 0.1001, 'grad_norm': 0.00268056383356452, 'learning_rate': 1e-05, 'num_tokens': 58765763.0, 'completions/mean_length': 4968.21875, 'completions/min_length': 785.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4000.77978515625, 'completions/min_terminated_length': 785.0, 'completions/max_terminated_length': 13865.0, 'rewards/accuracy_reward/mean': 0.765625, 'rewards/accuracy_reward/std': 0.42695629596710205, 'reward': 0.765625, 'reward_std': 0.28460076451301575, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015506323426961899, 'sampling/sampling_logp_difference/max': 1.909333348274231, 'sampling/importance_sampling_ratio/min': 0.14817914366722107, 'sampling/importance_sampling_ratio/mean': 0.9999540448188782, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40487589687108994, 'clip_ratio/low_mean': 0.00016056902313721366, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.312859666948498e-05, 'clip_ratio/high_max': 0.0001325143866779399, 'clip_ratio/region_mean': 0.00019369762139831437, 'epoch': 0.08}
+
+ 16%|█▋        | 168/1024 [12:24:15<61:15:34, 257.63s/it][AINFO 12-02 14:06:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:06:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:06:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:06:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 169/1024 [12:29:46<66:24:42, 279.63s/it][A
+                                                         [A{'loss': 0.0864, 'grad_norm': 0.0018393347272649407, 'learning_rate': 1e-05, 'num_tokens': 59282983.0, 'completions/mean_length': 7930.6875, 'completions/min_length': 1098.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 6892.5615234375, 'completions/min_terminated_length': 1098.0, 'completions/max_terminated_length': 16130.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.421875, 'reward_std': 0.31512534618377686, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019846247509121895, 'sampling/sampling_logp_difference/max': 12.455373764038086, 'sampling/importance_sampling_ratio/min': 3.896726411767304e-06, 'sampling/importance_sampling_ratio/mean': 1.0000158548355103, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5562672168016434, 'clip_ratio/low_mean': 0.00023584384871355724, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.62909726795624e-05, 'clip_ratio/high_max': 0.000156758496814291, 'clip_ratio/region_mean': 0.0002921348275322089, 'epoch': 0.08}
+
+ 17%|█▋        | 169/1024 [12:29:46<66:24:42, 279.63s/it][AINFO 12-02 14:11:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:11:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:11:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:11:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 170/1024 [12:34:06<64:54:39, 273.63s/it][A
+                                                         [A{'loss': 0.0153, 'grad_norm': 0.005675601772964001, 'learning_rate': 1e-05, 'num_tokens': 59580750.0, 'completions/mean_length': 4504.609375, 'completions/min_length': 157.0, 'completions/max_length': 15727.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4504.609375, 'completions/min_terminated_length': 157.0, 'completions/max_terminated_length': 15727.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.31512534618377686, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017390597611665726, 'sampling/sampling_logp_difference/max': 2.880298376083374, 'sampling/importance_sampling_ratio/min': 0.05611801892518997, 'sampling/importance_sampling_ratio/mean': 1.0000591278076172, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.52762096747756, 'clip_ratio/low_mean': 5.3169938382779947e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8253825121282716e-05, 'clip_ratio/high_max': 0.00010225613459624583, 'clip_ratio/region_mean': 8.142376282194164e-05, 'epoch': 0.08}
+
+ 17%|█▋        | 170/1024 [12:34:06<64:54:39, 273.63s/it][AINFO 12-02 14:16:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:16:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:16:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:16:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 171/1024 [12:38:49<65:29:48, 276.42s/it][A
+                                                         [A{'loss': 0.0864, 'grad_norm': 0.0017957915551960468, 'learning_rate': 1e-05, 'num_tokens': 59961856.0, 'completions/mean_length': 5784.90625, 'completions/min_length': 412.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5443.0, 'completions/min_terminated_length': 412.0, 'completions/max_terminated_length': 15769.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.28883427381515503, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01577308028936386, 'sampling/sampling_logp_difference/max': 3.988126277923584, 'sampling/importance_sampling_ratio/min': 0.018534410744905472, 'sampling/importance_sampling_ratio/mean': 0.9999531507492065, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43236587196588516, 'clip_ratio/low_mean': 0.0001562846778142557, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.4869884732179344e-05, 'clip_ratio/high_max': 0.00014009990445629228, 'clip_ratio/region_mean': 0.0002011545602726983, 'epoch': 0.08}
+
+ 17%|█▋        | 171/1024 [12:38:49<65:29:48, 276.42s/it][AINFO 12-02 14:20:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 172/1024 [12:43:52<67:19:19, 284.46s/it][A
+                                                         [A{'loss': 0.0297, 'grad_norm': 0.003165139351040125, 'learning_rate': 1e-05, 'num_tokens': 60454254.0, 'completions/mean_length': 7542.84375, 'completions/min_length': 460.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6793.59326171875, 'completions/min_terminated_length': 460.0, 'completions/max_terminated_length': 15975.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.34352827072143555, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.02001876011490822, 'sampling/sampling_logp_difference/max': 2.7591142654418945, 'sampling/importance_sampling_ratio/min': 0.06334784626960754, 'sampling/importance_sampling_ratio/mean': 0.9998705387115479, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5352175273001194, 'clip_ratio/low_mean': 0.00022808159610576695, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.030399910945562e-05, 'clip_ratio/high_max': 0.00021212153205851791, 'clip_ratio/region_mean': 0.0002983855974889593, 'epoch': 0.08}
+
+ 17%|█▋        | 172/1024 [12:43:52<67:19:19, 284.46s/it][AINFO 12-02 14:25:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:25:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:25:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:25:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 173/1024 [12:48:57<68:39:42, 290.46s/it][A
+                                                         [A{'loss': 0.1112, 'grad_norm': 0.0031022382900118828, 'learning_rate': 1e-05, 'num_tokens': 60825621.0, 'completions/mean_length': 5653.734375, 'completions/min_length': 549.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4744.3896484375, 'completions/min_terminated_length': 549.0, 'completions/max_terminated_length': 16116.0, 'rewards/accuracy_reward/mean': 0.6875, 'rewards/accuracy_reward/std': 0.467176616191864, 'reward': 0.6875, 'reward_std': 0.32195523381233215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0169343464076519, 'sampling/sampling_logp_difference/max': 2.55395245552063, 'sampling/importance_sampling_ratio/min': 0.07777366042137146, 'sampling/importance_sampling_ratio/mean': 1.000047206878662, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4163411669433117, 'clip_ratio/low_mean': 9.701973590381385e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.5235827201395296e-05, 'clip_ratio/high_max': 0.00016802474783617072, 'clip_ratio/region_mean': 0.00014225556469682488, 'epoch': 0.08}
+
+ 17%|█▋        | 173/1024 [12:48:57<68:39:42, 290.46s/it][AINFO 12-02 14:31:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:31:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:31:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:31:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 174/1024 [12:53:57<69:17:11, 293.45s/it][A
+                                                         [A{'loss': 0.0124, 'grad_norm': 0.0027011719066649675, 'learning_rate': 1e-05, 'num_tokens': 61256147.0, 'completions/mean_length': 6580.09375, 'completions/min_length': 586.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 5376.10546875, 'completions/min_terminated_length': 586.0, 'completions/max_terminated_length': 16141.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.2109457403421402, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0187239907681942, 'sampling/sampling_logp_difference/max': 2.253652572631836, 'sampling/importance_sampling_ratio/min': 0.10501494258642197, 'sampling/importance_sampling_ratio/mean': 1.0001040697097778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5355764329433441, 'clip_ratio/low_mean': 0.00013417376794677693, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.553416420094436e-05, 'clip_ratio/high_max': 0.0001506949593022, 'clip_ratio/region_mean': 0.00017970793419408437, 'epoch': 0.08}
+
+ 17%|█▋        | 174/1024 [12:53:57<69:17:11, 293.45s/it][AINFO 12-02 14:36:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:36:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:36:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:36:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 175/1024 [12:58:45<68:47:43, 291.71s/it][A
+                                                         [A{'loss': 0.0641, 'grad_norm': 0.0010618126252666116, 'learning_rate': 1e-05, 'num_tokens': 61677511.0, 'completions/mean_length': 6447.0625, 'completions/min_length': 777.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 5419.103515625, 'completions/min_terminated_length': 777.0, 'completions/max_terminated_length': 15875.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.2867126166820526, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019278470426797867, 'sampling/sampling_logp_difference/max': 1.2334816455841064, 'sampling/importance_sampling_ratio/min': 0.2912766933441162, 'sampling/importance_sampling_ratio/mean': 0.9998835325241089, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46078241989016533, 'clip_ratio/low_mean': 0.00018547346917330287, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6506182166485814e-05, 'clip_ratio/high_max': 0.00010602472866594326, 'clip_ratio/region_mean': 0.00021197965179453604, 'epoch': 0.08}
+
+ 17%|█▋        | 175/1024 [12:58:45<68:47:43, 291.71s/it][AINFO 12-02 14:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 176/1024 [13:03:42<69:06:10, 293.36s/it][A
+                                                         [A{'loss': 0.1329, 'grad_norm': 0.0017744052456691861, 'learning_rate': 1e-05, 'num_tokens': 62061908.0, 'completions/mean_length': 5836.578125, 'completions/min_length': 663.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.140625, 'completions/mean_terminated_length': 4110.63623046875, 'completions/min_terminated_length': 663.0, 'completions/max_terminated_length': 13388.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.4187060594558716, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015235263854265213, 'sampling/sampling_logp_difference/max': 2.413378953933716, 'sampling/importance_sampling_ratio/min': 0.08951232582330704, 'sampling/importance_sampling_ratio/mean': 1.0001177787780762, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.37975843995809555, 'clip_ratio/low_mean': 0.00024174508507712744, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.822044184038532e-05, 'clip_ratio/high_max': 0.00014633266437158454, 'clip_ratio/region_mean': 0.0002899655282817548, 'epoch': 0.08}
+
+ 17%|█▋        | 176/1024 [13:03:42<69:06:10, 293.36s/it][AINFO 12-02 14:45:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:45:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:45:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:45:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 177/1024 [13:07:26<64:07:30, 272.55s/it][A
+                                                         [A{'loss': -0.0155, 'grad_norm': 0.0011726372176781297, 'learning_rate': 1e-05, 'num_tokens': 62333347.0, 'completions/mean_length': 4099.234375, 'completions/min_length': 1165.0, 'completions/max_length': 14475.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4099.234375, 'completions/min_terminated_length': 1165.0, 'completions/max_terminated_length': 14475.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.2893187999725342, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.014661148190498352, 'sampling/sampling_logp_difference/max': 3.583422899246216, 'sampling/importance_sampling_ratio/min': 0.027780447155237198, 'sampling/importance_sampling_ratio/mean': 1.000042200088501, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5022605285048485, 'clip_ratio/low_mean': 4.295969642953423e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.396024163113907e-05, 'clip_ratio/high_max': 5.584096652455628e-05, 'clip_ratio/region_mean': 5.69199380606733e-05, 'epoch': 0.08}
+
+ 17%|█▋        | 177/1024 [13:07:26<64:07:30, 272.55s/it][AINFO 12-02 14:49:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 178/1024 [13:11:38<62:37:41, 266.50s/it][A
+                                                         [A{'loss': 0.0526, 'grad_norm': 0.0013127598213031888, 'learning_rate': 1e-05, 'num_tokens': 62720098.0, 'completions/mean_length': 5880.359375, 'completions/min_length': 966.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5713.63525390625, 'completions/min_terminated_length': 966.0, 'completions/max_terminated_length': 13780.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.46875, 'reward_std': 0.30038219690322876, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01809724047780037, 'sampling/sampling_logp_difference/max': 1.9126999378204346, 'sampling/importance_sampling_ratio/min': 0.1476811170578003, 'sampling/importance_sampling_ratio/mean': 1.0001051425933838, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5196768753230572, 'clip_ratio/low_mean': 0.00018922266372101149, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.145587345443346e-05, 'clip_ratio/high_max': 0.0001044107525558502, 'clip_ratio/region_mean': 0.0002206785388807475, 'epoch': 0.08}
+
+ 17%|█▋        | 178/1024 [13:11:38<62:37:41, 266.50s/it][AINFO 12-02 14:53:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:53:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:53:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:53:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 17%|█▋        | 179/1024 [13:16:39<64:57:27, 276.74s/it][A
+                                                         [A{'loss': 0.0803, 'grad_norm': 0.001370625221170485, 'learning_rate': 1e-05, 'num_tokens': 63053797.0, 'completions/mean_length': 5037.296875, 'completions/min_length': 605.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4479.26220703125, 'completions/min_terminated_length': 605.0, 'completions/max_terminated_length': 14040.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.46875, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01526111364364624, 'sampling/sampling_logp_difference/max': 2.756603240966797, 'sampling/importance_sampling_ratio/min': 0.06350711733102798, 'sampling/importance_sampling_ratio/mean': 0.9999675750732422, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42641017213463783, 'clip_ratio/low_mean': 0.00011718644213942753, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.004296351922676e-05, 'clip_ratio/high_max': 0.00014467854634858668, 'clip_ratio/region_mean': 0.00015722940247542283, 'epoch': 0.08}
+
+ 17%|█▋        | 179/1024 [13:16:39<64:57:27, 276.74s/it][AINFO 12-02 14:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:58:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:58:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 180/1024 [13:20:35<62:01:24, 264.55s/it][A
+                                                         [A{'loss': 0.0786, 'grad_norm': 0.002294801641255617, 'learning_rate': 1e-05, 'num_tokens': 63344599.0, 'completions/mean_length': 4394.53125, 'completions/min_length': 759.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4204.22265625, 'completions/min_terminated_length': 759.0, 'completions/max_terminated_length': 13448.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.400318443775177, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014346064999699593, 'sampling/sampling_logp_difference/max': 1.2036970853805542, 'sampling/importance_sampling_ratio/min': 0.3000827431678772, 'sampling/importance_sampling_ratio/mean': 1.000016450881958, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3875620365142822, 'clip_ratio/low_mean': 0.00014431385261559626, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.787059512840642e-05, 'clip_ratio/high_max': 0.00018301938052900368, 'clip_ratio/region_mean': 0.00020218444797137636, 'epoch': 0.08}
+
+ 18%|█▊        | 180/1024 [13:20:35<62:01:24, 264.55s/it][AINFO 12-02 15:02:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:02:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 181/1024 [13:25:14<62:58:01, 268.90s/it][A
+                                                         [A{'loss': 0.0527, 'grad_norm': 0.0019423601916059852, 'learning_rate': 1e-05, 'num_tokens': 63728066.0, 'completions/mean_length': 5845.171875, 'completions/min_length': 342.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5505.20947265625, 'completions/min_terminated_length': 342.0, 'completions/max_terminated_length': 15618.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.36978405714035034, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01772642694413662, 'sampling/sampling_logp_difference/max': 1.9703752994537354, 'sampling/importance_sampling_ratio/min': 0.1394045203924179, 'sampling/importance_sampling_ratio/mean': 0.9998786449432373, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5000931918621063, 'clip_ratio/low_mean': 0.00021972163449390791, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.8031233291112585e-05, 'clip_ratio/high_max': 0.0001767508074408397, 'clip_ratio/region_mean': 0.00026775287187774666, 'epoch': 0.08}
+
+ 18%|█▊        | 181/1024 [13:25:14<62:58:01, 268.90s/it][AINFO 12-02 15:07:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 182/1024 [13:30:46<67:18:46, 287.80s/it][A
+                                                         [A{'loss': 0.0099, 'grad_norm': 0.001543469843454659, 'learning_rate': 1e-05, 'num_tokens': 64215515.0, 'completions/mean_length': 7359.390625, 'completions/min_length': 568.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6923.91748046875, 'completions/min_terminated_length': 568.0, 'completions/max_terminated_length': 15078.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.4364357888698578, 'reward': 0.25, 'reward_std': 0.13363061845302582, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.02153279073536396, 'sampling/sampling_logp_difference/max': 17.571203231811523, 'sampling/importance_sampling_ratio/min': 2.33842456509592e-08, 'sampling/importance_sampling_ratio/mean': 1.0000611543655396, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6715248450636864, 'clip_ratio/low_mean': 7.404642474284628e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.055521056514408e-05, 'clip_ratio/high_max': 4.222084226057632e-05, 'clip_ratio/region_mean': 8.460163553536404e-05, 'epoch': 0.08}
+
+ 18%|█▊        | 182/1024 [13:30:46<67:18:46, 287.80s/it][AINFO 12-02 15:12:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:12:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:12:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:12:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 183/1024 [13:35:37<67:28:00, 288.80s/it][A
+                                                         [A{'loss': 0.0217, 'grad_norm': 0.0019465356599539518, 'learning_rate': 1e-05, 'num_tokens': 64656253.0, 'completions/mean_length': 6702.90625, 'completions/min_length': 492.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 5701.41357421875, 'completions/min_terminated_length': 492.0, 'completions/max_terminated_length': 15401.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.35141608119010925, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016508109867572784, 'sampling/sampling_logp_difference/max': 2.291926145553589, 'sampling/importance_sampling_ratio/min': 0.10107159614562988, 'sampling/importance_sampling_ratio/mean': 1.000012755393982, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43159858137369156, 'clip_ratio/low_mean': 0.0001732713426463306, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.449729422300152e-05, 'clip_ratio/high_max': 8.670109718877939e-05, 'clip_ratio/region_mean': 0.00020776863766513998, 'epoch': 0.08}
+
+ 18%|█▊        | 183/1024 [13:35:37<67:28:00, 288.80s/it][AINFO 12-02 15:17:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:17:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:17:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:17:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 184/1024 [13:39:01<61:28:40, 263.48s/it][A
+                                                         [A{'loss': -0.0249, 'grad_norm': 0.003192627802491188, 'learning_rate': 1e-05, 'num_tokens': 64905591.0, 'completions/mean_length': 3763.28125, 'completions/min_length': 739.0, 'completions/max_length': 13117.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3763.28125, 'completions/min_terminated_length': 739.0, 'completions/max_terminated_length': 13117.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.36507582664489746, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0169939287006855, 'sampling/sampling_logp_difference/max': 1.1681791543960571, 'sampling/importance_sampling_ratio/min': 0.31093257665634155, 'sampling/importance_sampling_ratio/mean': 0.9999332427978516, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5009258165955544, 'clip_ratio/low_mean': 0.00010554293794484693, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.429811181376863e-05, 'clip_ratio/high_max': 0.00011160565463796956, 'clip_ratio/region_mean': 0.00013984105044073658, 'epoch': 0.08}
+
+ 18%|█▊        | 184/1024 [13:39:01<61:28:40, 263.48s/it][AINFO 12-02 15:21:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 185/1024 [13:43:21<61:06:14, 262.19s/it][A
+                                                         [A{'loss': 0.0869, 'grad_norm': 0.004198663402348757, 'learning_rate': 1e-05, 'num_tokens': 65213289.0, 'completions/mean_length': 4615.40625, 'completions/min_length': 850.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4428.603515625, 'completions/min_terminated_length': 850.0, 'completions/max_terminated_length': 15849.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.28460073471069336, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018118925392627716, 'sampling/sampling_logp_difference/max': 2.5912604331970215, 'sampling/importance_sampling_ratio/min': 0.07492554187774658, 'sampling/importance_sampling_ratio/mean': 1.0000096559524536, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48525139316916466, 'clip_ratio/low_mean': 5.1187697863497306e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.308798598794965e-05, 'clip_ratio/high_max': 0.00028079108233214356, 'clip_ratio/region_mean': 0.00014427568476094166, 'epoch': 0.09}
+
+ 18%|█▊        | 185/1024 [13:43:21<61:06:14, 262.19s/it][AINFO 12-02 15:25:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:25:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:25:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:25:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 186/1024 [13:48:28<64:11:53, 275.79s/it][A
+                                                         [A{'loss': 0.062, 'grad_norm': 0.0023174865636974573, 'learning_rate': 1e-05, 'num_tokens': 65676128.0, 'completions/mean_length': 7058.234375, 'completions/min_length': 283.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 6093.5, 'completions/min_terminated_length': 283.0, 'completions/max_terminated_length': 16242.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.434487521648407, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020446844398975372, 'sampling/sampling_logp_difference/max': 8.572606086730957, 'sampling/importance_sampling_ratio/min': 0.00018921888840850443, 'sampling/importance_sampling_ratio/mean': 0.9999809265136719, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5066586621105671, 'clip_ratio/low_mean': 0.0002451282889524009, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.815201274752326e-05, 'clip_ratio/high_max': 0.000243305325966503, 'clip_ratio/region_mean': 0.00032328030101780314, 'epoch': 0.09}
+
+ 18%|█▊        | 186/1024 [13:48:28<64:11:53, 275.79s/it][AINFO 12-02 15:30:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 187/1024 [13:53:33<66:10:17, 284.61s/it][A
+                                                         [A{'loss': 0.046, 'grad_norm': 0.0019449747633188963, 'learning_rate': 1e-05, 'num_tokens': 66069227.0, 'completions/mean_length': 5976.171875, 'completions/min_length': 288.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5282.31689453125, 'completions/min_terminated_length': 288.0, 'completions/max_terminated_length': 15807.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.40625, 'reward_std': 0.3119301199913025, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02005024626851082, 'sampling/sampling_logp_difference/max': 1.7604626417160034, 'sampling/importance_sampling_ratio/min': 0.17196528613567352, 'sampling/importance_sampling_ratio/mean': 0.9999864101409912, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5640180520713329, 'clip_ratio/low_mean': 0.0002216410339315189, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9686305424547754e-05, 'clip_ratio/high_max': 8.482501834805589e-05, 'clip_ratio/region_mean': 0.00025132734299404547, 'epoch': 0.09}
+
+ 18%|█▊        | 187/1024 [13:53:33<66:10:17, 284.61s/it][AINFO 12-02 15:35:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:35:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:35:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:35:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 188/1024 [13:58:18<66:07:11, 284.73s/it][A
+                                                         [A{'loss': 0.1261, 'grad_norm': 0.003890430787578225, 'learning_rate': 1e-05, 'num_tokens': 66352927.0, 'completions/mean_length': 4294.5625, 'completions/min_length': 836.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 3699.999755859375, 'completions/min_terminated_length': 836.0, 'completions/max_terminated_length': 15511.0, 'rewards/accuracy_reward/mean': 0.75, 'rewards/accuracy_reward/std': 0.4364357888698578, 'reward': 0.75, 'reward_std': 0.3014557659626007, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.012602383270859718, 'sampling/sampling_logp_difference/max': 1.8336169719696045, 'sampling/importance_sampling_ratio/min': 0.15983441472053528, 'sampling/importance_sampling_ratio/mean': 1.0000393390655518, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.33075474947690964, 'clip_ratio/low_mean': 0.00014878600768497563, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.9515091379580554e-05, 'clip_ratio/high_max': 0.0001574464149598498, 'clip_ratio/region_mean': 0.00020830109860980883, 'epoch': 0.09}
+
+ 18%|█▊        | 188/1024 [13:58:18<66:07:11, 284.73s/it][AINFO 12-02 15:40:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:40:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:40:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:40:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 18%|█▊        | 189/1024 [14:01:52<61:05:00, 263.35s/it][A
+                                                         [A{'loss': -0.0131, 'grad_norm': 0.0012454156531021, 'learning_rate': 1e-05, 'num_tokens': 66593778.0, 'completions/mean_length': 3622.546875, 'completions/min_length': 610.0, 'completions/max_length': 13754.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3622.546875, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 13754.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014155917800962925, 'sampling/sampling_logp_difference/max': 2.931702136993408, 'sampling/importance_sampling_ratio/min': 0.053306225687265396, 'sampling/importance_sampling_ratio/mean': 0.9999906420707703, 'sampling/importance_sampling_ratio/max': 1.868618369102478, 'entropy': 0.38039437495172024, 'clip_ratio/low_mean': 0.0001413008285453543, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.425679379797657e-05, 'clip_ratio/high_max': 0.00023468059771403205, 'clip_ratio/region_mean': 0.00022555762689080439, 'epoch': 0.09}
+
+ 18%|█▊        | 189/1024 [14:01:52<61:05:00, 263.35s/it][AINFO 12-02 15:43:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:43:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:43:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:43:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▊        | 190/1024 [14:05:18<57:01:59, 246.19s/it][A
+                                                         [A{'loss': 0.0851, 'grad_norm': 0.00262165698222816, 'learning_rate': 1e-05, 'num_tokens': 66910284.0, 'completions/mean_length': 4807.03125, 'completions/min_length': 342.0, 'completions/max_length': 13975.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4807.03125, 'completions/min_terminated_length': 342.0, 'completions/max_terminated_length': 13975.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.41398805379867554, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01738087832927704, 'sampling/sampling_logp_difference/max': 1.5983357429504395, 'sampling/importance_sampling_ratio/min': 0.20223280787467957, 'sampling/importance_sampling_ratio/mean': 0.9998066425323486, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4457588791847229, 'clip_ratio/low_mean': 0.00019064946809521643, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.8886223087029066e-05, 'clip_ratio/high_max': 0.00019812876416835934, 'clip_ratio/region_mean': 0.0002495356930012349, 'epoch': 0.09}
+
+ 19%|█▊        | 190/1024 [14:05:18<57:01:59, 246.19s/it][AINFO 12-02 15:47:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:47:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:47:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:47:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▊        | 191/1024 [14:09:46<58:28:56, 252.75s/it][A
+                                                         [A{'loss': 0.0427, 'grad_norm': 0.0025396535638719797, 'learning_rate': 1e-05, 'num_tokens': 67303328.0, 'completions/mean_length': 5993.8125, 'completions/min_length': 580.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5301.1337890625, 'completions/min_terminated_length': 580.0, 'completions/max_terminated_length': 14832.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.41610968112945557, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017853666096925735, 'sampling/sampling_logp_difference/max': 13.86157512664795, 'sampling/importance_sampling_ratio/min': 9.54980350797996e-07, 'sampling/importance_sampling_ratio/mean': 0.9999922513961792, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4071556143462658, 'clip_ratio/low_mean': 0.0003225664290766872, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.546240956595284e-05, 'clip_ratio/high_max': 0.00018984233793162275, 'clip_ratio/region_mean': 0.00038802883864264004, 'epoch': 0.09}
+
+ 19%|█▊        | 191/1024 [14:09:46<58:28:56, 252.75s/it][AINFO 12-02 15:51:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:51:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:51:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:51:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 192/1024 [14:14:08<59:01:46, 255.42s/it][A
+                                                         [A{'loss': 0.0399, 'grad_norm': 0.005749806761741638, 'learning_rate': 1e-05, 'num_tokens': 67593626.0, 'completions/mean_length': 4402.90625, 'completions/min_length': 514.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4212.73046875, 'completions/min_terminated_length': 514.0, 'completions/max_terminated_length': 13565.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.3913668990135193, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015034412033855915, 'sampling/sampling_logp_difference/max': 5.37518835067749, 'sampling/importance_sampling_ratio/min': 0.004630046430975199, 'sampling/importance_sampling_ratio/mean': 1.0001617670059204, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3910212889313698, 'clip_ratio/low_mean': 0.0003322051306895446, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.586918746303127e-05, 'clip_ratio/high_max': 0.00014319884849101072, 'clip_ratio/region_mean': 0.0003880743206536863, 'epoch': 0.09}
+
+ 19%|█▉        | 192/1024 [14:14:08<59:01:46, 255.42s/it][AINFO 12-02 15:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:56:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 193/1024 [14:18:11<58:08:50, 251.90s/it][A
+                                                         [A{'loss': 0.0187, 'grad_norm': 0.002849260577932, 'learning_rate': 1e-05, 'num_tokens': 67803777.0, 'completions/mean_length': 3140.734375, 'completions/min_length': 810.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 2930.52392578125, 'completions/min_terminated_length': 810.0, 'completions/max_terminated_length': 10404.0, 'rewards/accuracy_reward/mean': 0.84375, 'rewards/accuracy_reward/std': 0.36596253514289856, 'reward': 0.84375, 'reward_std': 0.23827511072158813, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.012824609875679016, 'sampling/sampling_logp_difference/max': 1.2435317039489746, 'sampling/importance_sampling_ratio/min': 0.2883639931678772, 'sampling/importance_sampling_ratio/mean': 1.0000371932983398, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3649657070636749, 'clip_ratio/low_mean': 0.00013347376761885243, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.714091267032927e-05, 'clip_ratio/high_max': 0.00018856365068131709, 'clip_ratio/region_mean': 0.00018061468108498957, 'epoch': 0.09}
+
+ 19%|█▉        | 193/1024 [14:18:11<58:08:50, 251.90s/it][AINFO 12-02 16:00:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:00:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:00:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:00:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 194/1024 [14:23:23<62:11:18, 269.73s/it][A
+                                                         [A{'loss': 0.0458, 'grad_norm': 0.001149773015640676, 'learning_rate': 1e-05, 'num_tokens': 68200279.0, 'completions/mean_length': 6050.21875, 'completions/min_length': 763.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5716.87060546875, 'completions/min_terminated_length': 763.0, 'completions/max_terminated_length': 16014.0, 'rewards/accuracy_reward/mean': 0.765625, 'rewards/accuracy_reward/std': 0.42695629596710205, 'reward': 0.765625, 'reward_std': 0.16887323558330536, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01733572408556938, 'sampling/sampling_logp_difference/max': 1.4956789016723633, 'sampling/importance_sampling_ratio/min': 0.224096417427063, 'sampling/importance_sampling_ratio/mean': 1.0000070333480835, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5111652053892612, 'clip_ratio/low_mean': 0.0001060730246535968, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.11978815261682e-05, 'clip_ratio/high_max': 0.00011536183410498779, 'clip_ratio/region_mean': 0.00013727090754400706, 'epoch': 0.09}
+
+ 19%|█▉        | 194/1024 [14:23:23<62:11:18, 269.73s/it][AINFO 12-02 16:05:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:05:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:05:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:05:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 195/1024 [14:28:29<64:36:20, 280.55s/it][A
+                                                         [A{'loss': 0.1214, 'grad_norm': 0.001131602330133319, 'learning_rate': 1e-05, 'num_tokens': 68557965.0, 'completions/mean_length': 5421.59375, 'completions/min_length': 947.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5247.58740234375, 'completions/min_terminated_length': 947.0, 'completions/max_terminated_length': 15663.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.3571978509426117, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.0152858542278409, 'sampling/sampling_logp_difference/max': 5.221958637237549, 'sampling/importance_sampling_ratio/min': 0.005396748427301645, 'sampling/importance_sampling_ratio/mean': 0.9999902248382568, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.38775429874658585, 'clip_ratio/low_mean': 0.00023116412103263428, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.361560879966419e-05, 'clip_ratio/high_max': 0.00011749103850888787, 'clip_ratio/region_mean': 0.00026477973005967215, 'epoch': 0.09}
+
+ 19%|█▉        | 195/1024 [14:28:29<64:36:20, 280.55s/it][AINFO 12-02 16:10:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:10:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:10:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:10:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 196/1024 [14:32:26<61:33:25, 267.64s/it][A
+                                                         [A{'loss': 0.0354, 'grad_norm': 0.0018495054682716727, 'learning_rate': 1e-05, 'num_tokens': 69002729.0, 'completions/mean_length': 6801.5625, 'completions/min_length': 1357.0, 'completions/max_length': 13563.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6801.5625, 'completions/min_terminated_length': 1357.0, 'completions/max_terminated_length': 13563.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.546875, 'reward_std': 0.38664889335632324, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020176630467176437, 'sampling/sampling_logp_difference/max': 2.246924877166748, 'sampling/importance_sampling_ratio/min': 0.10572383552789688, 'sampling/importance_sampling_ratio/mean': 0.999967634677887, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6263939291238785, 'clip_ratio/low_mean': 0.000313361673761392, 'clip_ratio/low_min': 7.710813406447414e-06, 'clip_ratio/high_mean': 2.332010626560077e-05, 'clip_ratio/high_max': 7.546381038991967e-05, 'clip_ratio/region_mean': 0.0003366817818459822, 'epoch': 0.09}
+
+ 19%|█▉        | 196/1024 [14:32:26<61:33:25, 267.64s/it][AINFO 12-02 16:14:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:14:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:14:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:14:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 197/1024 [14:36:56<61:38:11, 268.31s/it][A
+                                                         [A{'loss': -0.1536, 'grad_norm': 0.0027144383639097214, 'learning_rate': 1e-05, 'num_tokens': 69280195.0, 'completions/mean_length': 4201.15625, 'completions/min_length': 689.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 3601.999755859375, 'completions/min_terminated_length': 689.0, 'completions/max_terminated_length': 13784.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.34929442405700684, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014583425596356392, 'sampling/sampling_logp_difference/max': 6.1873674392700195, 'sampling/importance_sampling_ratio/min': 0.0020552303176373243, 'sampling/importance_sampling_ratio/mean': 0.9998899698257446, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3891052044928074, 'clip_ratio/low_mean': 8.490890331813716e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0999698424238886e-05, 'clip_ratio/high_max': 0.00012399879369695554, 'clip_ratio/region_mean': 0.0001159086021971234, 'epoch': 0.09}
+
+ 19%|█▉        | 197/1024 [14:36:56<61:38:11, 268.31s/it][AINFO 12-02 16:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 198/1024 [14:39:34<53:57:51, 235.20s/it][A
+                                                         [A{'loss': 0.0321, 'grad_norm': 0.0010546933626756072, 'learning_rate': 1e-05, 'num_tokens': 69515036.0, 'completions/mean_length': 3524.265625, 'completions/min_length': 563.0, 'completions/max_length': 10326.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3524.265625, 'completions/min_terminated_length': 563.0, 'completions/max_terminated_length': 10326.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.2472364604473114, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.015132145024836063, 'sampling/sampling_logp_difference/max': 0.816753625869751, 'sampling/importance_sampling_ratio/min': 0.4418638050556183, 'sampling/importance_sampling_ratio/mean': 0.9999737739562988, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4574563056230545, 'clip_ratio/low_mean': 0.0001179930891339609, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.342183958920941e-05, 'clip_ratio/high_max': 0.00011457713208073983, 'clip_ratio/region_mean': 0.00015141492622205988, 'epoch': 0.09}
+
+ 19%|█▉        | 198/1024 [14:39:34<53:57:51, 235.20s/it][AINFO 12-02 16:21:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:21:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:21:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:21:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 19%|█▉        | 199/1024 [14:43:51<55:25:46, 241.87s/it][A
+                                                         [A{'loss': 0.0216, 'grad_norm': 0.003099511144682765, 'learning_rate': 1e-05, 'num_tokens': 69741747.0, 'completions/mean_length': 3392.984375, 'completions/min_length': 827.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3186.778076171875, 'completions/min_terminated_length': 827.0, 'completions/max_terminated_length': 13733.0, 'rewards/accuracy_reward/mean': 0.796875, 'rewards/accuracy_reward/std': 0.40550529956817627, 'reward': 0.796875, 'reward_std': 0.2561880350112915, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.012250704690814018, 'sampling/sampling_logp_difference/max': 1.3412132263183594, 'sampling/importance_sampling_ratio/min': 0.3545035719871521, 'sampling/importance_sampling_ratio/mean': 0.9999752640724182, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.32005265168845654, 'clip_ratio/low_mean': 9.889628790915594e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5278436623921152e-05, 'clip_ratio/high_max': 0.00010111374649568461, 'clip_ratio/region_mean': 0.0001241747245330771, 'epoch': 0.09}
+
+ 19%|█▉        | 199/1024 [14:43:51<55:25:46, 241.87s/it][AINFO 12-02 16:25:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:25:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:25:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:25:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 200/1024 [14:48:24<57:26:59, 250.99s/it][A
+                                                         [A{'loss': 0.0348, 'grad_norm': 0.01134851761162281, 'learning_rate': 1e-05, 'num_tokens': 70128314.0, 'completions/mean_length': 5887.859375, 'completions/min_length': 1060.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5721.25439453125, 'completions/min_terminated_length': 1060.0, 'completions/max_terminated_length': 16015.0, 'rewards/accuracy_reward/mean': 0.671875, 'rewards/accuracy_reward/std': 0.4732423722743988, 'reward': 0.671875, 'reward_std': 0.29355230927467346, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016527555882930756, 'sampling/sampling_logp_difference/max': 1.414884090423584, 'sampling/importance_sampling_ratio/min': 0.24295376241207123, 'sampling/importance_sampling_ratio/mean': 0.9999840259552002, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48501599580049515, 'clip_ratio/low_mean': 0.00021369762953327154, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9595545811389457e-05, 'clip_ratio/high_max': 9.428406247025123e-05, 'clip_ratio/region_mean': 0.00024329318057425553, 'epoch': 0.09}
+
+ 20%|█▉        | 200/1024 [14:48:24<57:26:59, 250.99s/it][AINFO 12-02 16:30:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 201/1024 [14:52:38<57:38:40, 252.15s/it][A
+                                                         [A{'loss': 0.0564, 'grad_norm': 0.0015120146563276649, 'learning_rate': 1e-05, 'num_tokens': 70451913.0, 'completions/mean_length': 4918.484375, 'completions/min_length': 420.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4736.4921875, 'completions/min_terminated_length': 420.0, 'completions/max_terminated_length': 15278.0, 'rewards/accuracy_reward/mean': 0.6875, 'rewards/accuracy_reward/std': 0.467176616191864, 'reward': 0.6875, 'reward_std': 0.35400262475013733, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016887318342924118, 'sampling/sampling_logp_difference/max': 1.9251396656036377, 'sampling/importance_sampling_ratio/min': 0.1458553820848465, 'sampling/importance_sampling_ratio/mean': 0.9999623894691467, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.48017290234565735, 'clip_ratio/low_mean': 0.0002474979714861547, 'clip_ratio/low_min': 1.8278862626175396e-05, 'clip_ratio/high_mean': 6.400648044291302e-05, 'clip_ratio/high_max': 0.00018024269047600683, 'clip_ratio/region_mean': 0.00031150445283856243, 'epoch': 0.09}
+
+ 20%|█▉        | 201/1024 [14:52:38<57:38:40, 252.15s/it][AINFO 12-02 16:34:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:34:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:34:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:34:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 202/1024 [14:56:30<56:11:32, 246.10s/it][A
+                                                         [A{'loss': 0.0527, 'grad_norm': 0.004667791537940502, 'learning_rate': 1e-05, 'num_tokens': 70732160.0, 'completions/mean_length': 4219.609375, 'completions/min_length': 254.0, 'completions/max_length': 13793.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4219.609375, 'completions/min_terminated_length': 254.0, 'completions/max_terminated_length': 13793.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.3424547016620636, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.015926141291856766, 'sampling/sampling_logp_difference/max': 1.7520068883895874, 'sampling/importance_sampling_ratio/min': 0.17342554032802582, 'sampling/importance_sampling_ratio/mean': 0.9998953342437744, 'sampling/importance_sampling_ratio/max': 1.9382649660110474, 'entropy': 0.572041105479002, 'clip_ratio/low_mean': 0.00025798506612773053, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.765578723388899e-05, 'clip_ratio/high_max': 0.00022520906804857077, 'clip_ratio/region_mean': 0.0003256408526794985, 'epoch': 0.09}
+
+ 20%|█▉        | 202/1024 [14:56:30<56:11:32, 246.10s/it][AINFO 12-02 16:38:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:38:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:38:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:38:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 203/1024 [15:01:11<58:30:47, 256.57s/it][A
+                                                         [A{'loss': -0.024, 'grad_norm': 0.0012569375103339553, 'learning_rate': 1e-05, 'num_tokens': 71090562.0, 'completions/mean_length': 5460.65625, 'completions/min_length': 430.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5287.27001953125, 'completions/min_terminated_length': 430.0, 'completions/max_terminated_length': 15883.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.16675157845020294, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.016552096232771873, 'sampling/sampling_logp_difference/max': 2.003696918487549, 'sampling/importance_sampling_ratio/min': 0.15485866367816925, 'sampling/importance_sampling_ratio/mean': 1.0001070499420166, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4207439385354519, 'clip_ratio/low_mean': 5.376843546400778e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.445198318440816e-06, 'clip_ratio/high_max': 3.3780793273763265e-05, 'clip_ratio/region_mean': 6.221363241820654e-05, 'epoch': 0.09}
+
+ 20%|█▉        | 203/1024 [15:01:11<58:30:47, 256.57s/it][AINFO 12-02 16:43:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:43:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:43:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:43:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 204/1024 [15:06:01<60:42:38, 266.53s/it][A
+                                                         [A{'loss': 0.1174, 'grad_norm': 0.00043164865928702056, 'learning_rate': 1e-05, 'num_tokens': 71511088.0, 'completions/mean_length': 6401.96875, 'completions/min_length': 522.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5556.03369140625, 'completions/min_terminated_length': 522.0, 'completions/max_terminated_length': 13878.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.3403330445289612, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016999561339616776, 'sampling/sampling_logp_difference/max': 2.4094834327697754, 'sampling/importance_sampling_ratio/min': 0.08986170589923859, 'sampling/importance_sampling_ratio/mean': 1.0001401901245117, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.495794378221035, 'clip_ratio/low_mean': 0.0002309140500074136, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.191974994682823e-05, 'clip_ratio/high_max': 0.00012056542891514255, 'clip_ratio/region_mean': 0.0002728337931330316, 'epoch': 0.09}
+
+ 20%|█▉        | 204/1024 [15:06:01<60:42:38, 266.53s/it][AINFO 12-02 16:48:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:48:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:48:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:48:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 205/1024 [15:10:09<59:19:50, 260.79s/it][A
+                                                         [A{'loss': 0.0272, 'grad_norm': 0.003600046504288912, 'learning_rate': 1e-05, 'num_tokens': 71797886.0, 'completions/mean_length': 4333.96875, 'completions/min_length': 197.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4142.69873046875, 'completions/min_terminated_length': 197.0, 'completions/max_terminated_length': 13251.0, 'rewards/accuracy_reward/mean': 0.78125, 'rewards/accuracy_reward/std': 0.4166666865348816, 'reward': 0.78125, 'reward_std': 0.24359199404716492, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017158811911940575, 'sampling/sampling_logp_difference/max': 1.7825268507003784, 'sampling/importance_sampling_ratio/min': 0.16821257770061493, 'sampling/importance_sampling_ratio/mean': 0.9999789595603943, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3832726515829563, 'clip_ratio/low_mean': 0.00021531851598410867, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.3013071717723506e-05, 'clip_ratio/high_max': 0.00017205228687089402, 'clip_ratio/region_mean': 0.00025833158906607423, 'epoch': 0.09}
+
+ 20%|██        | 205/1024 [15:10:09<59:19:50, 260.79s/it][AINFO 12-02 16:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:52:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 206/1024 [15:15:05<61:42:55, 271.61s/it][A
+                                                         [A{'loss': 0.0644, 'grad_norm': 0.002378655131906271, 'learning_rate': 1e-05, 'num_tokens': 72288847.0, 'completions/mean_length': 7523.890625, 'completions/min_length': 493.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6773.03369140625, 'completions/min_terminated_length': 493.0, 'completions/max_terminated_length': 16337.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.4375, 'reward_std': 0.30038219690322876, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022135945037007332, 'sampling/sampling_logp_difference/max': 2.40518856048584, 'sampling/importance_sampling_ratio/min': 0.0902484729886055, 'sampling/importance_sampling_ratio/mean': 1.0001146793365479, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6128045171499252, 'clip_ratio/low_mean': 0.00026779471363624907, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.002622310712468e-05, 'clip_ratio/high_max': 0.0001482285506426706, 'clip_ratio/region_mean': 0.0003178209381076158, 'epoch': 0.09}
+
+ 20%|██        | 206/1024 [15:15:05<61:42:55, 271.61s/it][AINFO 12-02 16:57:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:57:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 207/1024 [15:20:12<63:59:23, 281.96s/it][A
+                                                         [A{'loss': 0.0927, 'grad_norm': 0.0025266362354159355, 'learning_rate': 1e-05, 'num_tokens': 72761669.0, 'completions/mean_length': 7232.34375, 'completions/min_length': 907.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 5924.96435546875, 'completions/min_terminated_length': 907.0, 'completions/max_terminated_length': 15701.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.34929439425468445, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018426189199090004, 'sampling/sampling_logp_difference/max': 15.687936782836914, 'sampling/importance_sampling_ratio/min': 1.5375019302155124e-07, 'sampling/importance_sampling_ratio/mean': 0.9999599456787109, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42693084105849266, 'clip_ratio/low_mean': 0.00027161065372638404, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.353595113570918e-05, 'clip_ratio/high_max': 0.0001411827925039688, 'clip_ratio/region_mean': 0.00031514659895037767, 'epoch': 0.1}
+
+ 20%|██        | 207/1024 [15:20:12<63:59:23, 281.96s/it][AINFO 12-02 17:02:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 208/1024 [15:24:57<64:08:10, 282.95s/it][A
+                                                         [A{'loss': 0.0525, 'grad_norm': 0.003213850548490882, 'learning_rate': 1e-05, 'num_tokens': 73202034.0, 'completions/mean_length': 6670.203125, 'completions/min_length': 772.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6192.47509765625, 'completions/min_terminated_length': 772.0, 'completions/max_terminated_length': 15941.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.35141605138778687, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01795230060815811, 'sampling/sampling_logp_difference/max': 4.202272891998291, 'sampling/importance_sampling_ratio/min': 0.01496153324842453, 'sampling/importance_sampling_ratio/mean': 0.9999494552612305, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.43840334936976433, 'clip_ratio/low_mean': 0.0001402696670993464, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.485467065431294e-05, 'clip_ratio/high_max': 0.00019446700571279507, 'clip_ratio/region_mean': 0.000205124337298912, 'epoch': 0.1}
+
+ 20%|██        | 208/1024 [15:24:57<64:08:10, 282.95s/it][AINFO 12-02 17:07:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:07:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:07:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:07:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|██        | 209/1024 [15:29:38<63:57:42, 282.53s/it][A
+                                                         [A{'loss': 0.0586, 'grad_norm': 0.0011513528879731894, 'learning_rate': 1e-05, 'num_tokens': 73620264.0, 'completions/mean_length': 6365.96875, 'completions/min_length': 527.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5516.98291015625, 'completions/min_terminated_length': 527.0, 'completions/max_terminated_length': 15056.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.3029785752296448, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021577036008238792, 'sampling/sampling_logp_difference/max': 9.410041809082031, 'sampling/importance_sampling_ratio/min': 8.189752406906337e-05, 'sampling/importance_sampling_ratio/mean': 0.9999346733093262, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.517903707921505, 'clip_ratio/low_mean': 7.256182152559632e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.714467235724442e-05, 'clip_ratio/high_max': 0.00022457982686319156, 'clip_ratio/region_mean': 0.00013970649433758808, 'epoch': 0.1}
+
+ 20%|██        | 209/1024 [15:29:38<63:57:42, 282.53s/it][AINFO 12-02 17:11:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:11:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 210/1024 [15:34:05<62:47:02, 277.67s/it][A
+                                                         [A{'loss': 0.1, 'grad_norm': 0.0017716643633320928, 'learning_rate': 1e-05, 'num_tokens': 73963706.0, 'completions/mean_length': 5223.40625, 'completions/min_length': 810.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4674.5244140625, 'completions/min_terminated_length': 810.0, 'completions/max_terminated_length': 15109.0, 'rewards/accuracy_reward/mean': 0.75, 'rewards/accuracy_reward/std': 0.4364357888698578, 'reward': 0.75, 'reward_std': 0.4308430552482605, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.01836252026259899, 'sampling/sampling_logp_difference/max': 4.468755722045898, 'sampling/importance_sampling_ratio/min': 0.01146156806498766, 'sampling/importance_sampling_ratio/mean': 0.999949038028717, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.51714938133955, 'clip_ratio/low_mean': 0.00011458138760644943, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.254700535166194e-05, 'clip_ratio/high_max': 0.00021729297077399679, 'clip_ratio/region_mean': 0.0002071283915938693, 'epoch': 0.1}
+
+ 21%|██        | 210/1024 [15:34:05<62:47:02, 277.67s/it][AINFO 12-02 17:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:16:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 211/1024 [15:37:45<58:51:12, 260.61s/it][A
+                                                         [A{'loss': 0.0019, 'grad_norm': 0.005028342362493277, 'learning_rate': 1e-05, 'num_tokens': 74249421.0, 'completions/mean_length': 4301.171875, 'completions/min_length': 629.0, 'completions/max_length': 12409.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4301.171875, 'completions/min_terminated_length': 629.0, 'completions/max_terminated_length': 12409.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.1462521106004715, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01590704917907715, 'sampling/sampling_logp_difference/max': 1.8422117233276367, 'sampling/importance_sampling_ratio/min': 0.15846656262874603, 'sampling/importance_sampling_ratio/mean': 0.9999874234199524, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4119699075818062, 'clip_ratio/low_mean': 0.00012882457031082595, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2612552356804372e-05, 'clip_ratio/high_max': 9.045020942721749e-05, 'clip_ratio/region_mean': 0.0001514371238044987, 'epoch': 0.1}
+
+ 21%|██        | 211/1024 [15:37:45<58:51:12, 260.61s/it][AINFO 12-02 17:19:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:19:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:19:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:19:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 212/1024 [15:41:42<57:07:17, 253.25s/it][A
+                                                         [A{'loss': 0.0484, 'grad_norm': 0.0019160618539899588, 'learning_rate': 1e-05, 'num_tokens': 74582886.0, 'completions/mean_length': 5072.265625, 'completions/min_length': 894.0, 'completions/max_length': 15408.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5072.265625, 'completions/min_terminated_length': 894.0, 'completions/max_terminated_length': 15408.0, 'rewards/accuracy_reward/mean': 0.6875, 'rewards/accuracy_reward/std': 0.467176616191864, 'reward': 0.6875, 'reward_std': 0.3119301199913025, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016023291274905205, 'sampling/sampling_logp_difference/max': 1.5214617252349854, 'sampling/importance_sampling_ratio/min': 0.21839243173599243, 'sampling/importance_sampling_ratio/mean': 0.999930202960968, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4638943448662758, 'clip_ratio/low_mean': 0.00016941237231549167, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 0.00016941237231549167, 'epoch': 0.1}
+
+ 21%|██        | 212/1024 [15:41:42<57:07:17, 253.25s/it][AINFO 12-02 17:23:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:23:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:23:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:23:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 213/1024 [15:46:01<57:30:13, 255.26s/it][A
+                                                         [A{'loss': 0.0793, 'grad_norm': 0.0031469049863517284, 'learning_rate': 1e-05, 'num_tokens': 74907010.0, 'completions/mean_length': 4939.3125, 'completions/min_length': 863.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4570.12890625, 'completions/min_terminated_length': 863.0, 'completions/max_terminated_length': 11472.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.3571978807449341, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01700844243168831, 'sampling/sampling_logp_difference/max': 10.868821144104004, 'sampling/importance_sampling_ratio/min': 1.9042805433855392e-05, 'sampling/importance_sampling_ratio/mean': 0.9998614192008972, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4833931587636471, 'clip_ratio/low_mean': 0.00024253790479633608, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.562458267784677e-05, 'clip_ratio/high_max': 0.000196728746232111, 'clip_ratio/region_mean': 0.0003181624888384249, 'epoch': 0.1}
+
+ 21%|██        | 213/1024 [15:46:01<57:30:13, 255.26s/it][AINFO 12-02 17:28:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 214/1024 [15:51:30<62:20:46, 277.09s/it][A
+                                                         [A{'loss': 0.0863, 'grad_norm': 0.00260414881631732, 'learning_rate': 1e-05, 'num_tokens': 75306035.0, 'completions/mean_length': 6087.765625, 'completions/min_length': 317.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5581.39306640625, 'completions/min_terminated_length': 317.0, 'completions/max_terminated_length': 16309.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.4375, 'reward_std': 0.41610968112945557, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018815182149410248, 'sampling/sampling_logp_difference/max': 1.514784812927246, 'sampling/importance_sampling_ratio/min': 0.21985548734664917, 'sampling/importance_sampling_ratio/mean': 1.0000262260437012, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5298577360808849, 'clip_ratio/low_mean': 0.0003473841352388263, 'clip_ratio/low_min': 1.1661535609164275e-05, 'clip_ratio/high_mean': 6.085088648433157e-05, 'clip_ratio/high_max': 0.00022859236833028262, 'clip_ratio/region_mean': 0.00040823502058628947, 'epoch': 0.1}
+
+ 21%|██        | 214/1024 [15:51:30<62:20:46, 277.09s/it][AINFO 12-02 17:33:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:33:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:33:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:33:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 215/1024 [15:53:42<52:32:57, 233.84s/it][A
+                                                         [A{'loss': -0.006, 'grad_norm': 0.0010832990519702435, 'learning_rate': 1e-05, 'num_tokens': 75514813.0, 'completions/mean_length': 3126.90625, 'completions/min_length': 599.0, 'completions/max_length': 9092.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3126.90625, 'completions/min_terminated_length': 599.0, 'completions/max_terminated_length': 9092.0, 'rewards/accuracy_reward/mean': 0.890625, 'rewards/accuracy_reward/std': 0.3145764470100403, 'reward': 0.890625, 'reward_std': 0.19939783215522766, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.013806769624352455, 'sampling/sampling_logp_difference/max': 3.897243022918701, 'sampling/importance_sampling_ratio/min': 0.02029779553413391, 'sampling/importance_sampling_ratio/mean': 0.9998923540115356, 'sampling/importance_sampling_ratio/max': 1.7046595811843872, 'entropy': 0.4343762621283531, 'clip_ratio/low_mean': 3.4744526601571124e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.080593165170285e-05, 'clip_ratio/high_max': 4.32237266068114e-05, 'clip_ratio/region_mean': 4.555045825327397e-05, 'epoch': 0.1}
+
+ 21%|██        | 215/1024 [15:53:42<52:32:57, 233.84s/it][AINFO 12-02 17:35:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:35:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:35:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:35:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 216/1024 [15:58:55<57:45:49, 257.36s/it][A
+                                                         [A{'loss': 0.0883, 'grad_norm': 0.0025122486986219883, 'learning_rate': 1e-05, 'num_tokens': 75986605.0, 'completions/mean_length': 7207.5, 'completions/min_length': 422.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6429.83056640625, 'completions/min_terminated_length': 422.0, 'completions/max_terminated_length': 15979.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.390625, 'reward_std': 0.3608325123786926, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01861385628581047, 'sampling/sampling_logp_difference/max': 5.642899513244629, 'sampling/importance_sampling_ratio/min': 0.0035425815731287003, 'sampling/importance_sampling_ratio/mean': 0.9997899532318115, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5835982710123062, 'clip_ratio/low_mean': 0.00020249037606845377, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.690190417022677e-05, 'clip_ratio/high_max': 0.00016449252689199056, 'clip_ratio/region_mean': 0.00024939227841969114, 'epoch': 0.1}
+
+ 21%|██        | 216/1024 [15:58:55<57:45:49, 257.36s/it][AINFO 12-02 17:41:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:41:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:41:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:41:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██        | 217/1024 [16:00:53<48:19:24, 215.57s/it][A
+                                                         [A{'loss': -0.0749, 'grad_norm': 0.0031897195149213076, 'learning_rate': 1e-05, 'num_tokens': 76121505.0, 'completions/mean_length': 1968.0625, 'completions/min_length': 275.0, 'completions/max_length': 8147.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 1968.0625, 'completions/min_terminated_length': 275.0, 'completions/max_terminated_length': 8147.0, 'rewards/accuracy_reward/mean': 0.78125, 'rewards/accuracy_reward/std': 0.4166666865348816, 'reward': 0.78125, 'reward_std': 0.37981897592544556, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.010531260631978512, 'sampling/sampling_logp_difference/max': 0.6905558109283447, 'sampling/importance_sampling_ratio/min': 0.5574459433555603, 'sampling/importance_sampling_ratio/mean': 1.0000404119491577, 'sampling/importance_sampling_ratio/max': 1.994823932647705, 'entropy': 0.2916927430778742, 'clip_ratio/low_mean': 6.944282631593524e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.915190311294282e-05, 'clip_ratio/high_max': 0.00014550567357218824, 'clip_ratio/region_mean': 0.00011859473033837276, 'epoch': 0.1}
+
+ 21%|██        | 217/1024 [16:00:53<48:19:24, 215.57s/it][AINFO 12-02 17:42:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 218/1024 [16:05:34<52:40:40, 235.29s/it][A
+                                                         [A{'loss': 0.0278, 'grad_norm': 0.006034399848431349, 'learning_rate': 1e-05, 'num_tokens': 76512557.0, 'completions/mean_length': 5962.9375, 'completions/min_length': 207.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5450.42578125, 'completions/min_terminated_length': 207.0, 'completions/max_terminated_length': 15719.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.26196980476379395, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01819833740592003, 'sampling/sampling_logp_difference/max': 4.1606597900390625, 'sampling/importance_sampling_ratio/min': 0.015597264282405376, 'sampling/importance_sampling_ratio/mean': 1.0000648498535156, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.45096710696816444, 'clip_ratio/low_mean': 0.00025042196011781925, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9753654618834844e-05, 'clip_ratio/high_max': 0.00014407207163458224, 'clip_ratio/region_mean': 0.00029017561428190675, 'epoch': 0.1}
+
+ 21%|██▏       | 218/1024 [16:05:34<52:40:40, 235.29s/it][AINFO 12-02 17:47:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:47:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:47:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:47:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 219/1024 [16:10:36<57:06:29, 255.39s/it][A
+                                                         [A{'loss': 0.0056, 'grad_norm': 0.002247847616672516, 'learning_rate': 1e-05, 'num_tokens': 77021585.0, 'completions/mean_length': 7763.8125, 'completions/min_length': 603.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7033.2880859375, 'completions/min_terminated_length': 603.0, 'completions/max_terminated_length': 15476.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.421875, 'reward_std': 0.3403330445289612, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019808072596788406, 'sampling/sampling_logp_difference/max': 2.816495418548584, 'sampling/importance_sampling_ratio/min': 0.05981520563364029, 'sampling/importance_sampling_ratio/mean': 0.9999669194221497, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4887936934828758, 'clip_ratio/low_mean': 0.0004555995328701101, 'clip_ratio/low_min': 8.552276631235145e-05, 'clip_ratio/high_mean': 4.485100043893908e-05, 'clip_ratio/high_max': 0.0001268659016204765, 'clip_ratio/region_mean': 0.0005004505364922807, 'epoch': 0.1}
+
+ 21%|██▏       | 219/1024 [16:10:36<57:06:29, 255.39s/it][AINFO 12-02 17:52:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:52:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:52:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:52:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 21%|██▏       | 220/1024 [16:13:36<51:59:00, 232.76s/it][A
+                                                         [A{'loss': 0.0126, 'grad_norm': 0.005488593131303787, 'learning_rate': 1e-05, 'num_tokens': 77278568.0, 'completions/mean_length': 3867.859375, 'completions/min_length': 899.0, 'completions/max_length': 11679.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3867.859375, 'completions/min_terminated_length': 899.0, 'completions/max_terminated_length': 11679.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.4604927599430084, 'reward': 0.703125, 'reward_std': 0.3266732692718506, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016852227970957756, 'sampling/sampling_logp_difference/max': 0.9524774551391602, 'sampling/importance_sampling_ratio/min': 0.3857840597629547, 'sampling/importance_sampling_ratio/mean': 1.0000158548355103, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4654340222477913, 'clip_ratio/low_mean': 7.476849577869871e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.593558608372405e-05, 'clip_ratio/high_max': 0.00010134770491276868, 'clip_ratio/region_mean': 0.00011070408254454378, 'epoch': 0.1}
+
+ 21%|██▏       | 220/1024 [16:13:36<51:59:00, 232.76s/it][AINFO 12-02 17:55:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:55:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:55:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:55:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 221/1024 [16:18:12<54:48:10, 245.69s/it][A
+                                                         [A{'loss': 0.0439, 'grad_norm': 0.0006666425615549088, 'learning_rate': 1e-05, 'num_tokens': 77694486.0, 'completions/mean_length': 6363.21875, 'completions/min_length': 724.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6039.9677734375, 'completions/min_terminated_length': 724.0, 'completions/max_terminated_length': 14804.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.2619796395301819, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0196120273321867, 'sampling/sampling_logp_difference/max': 3.8735718727111816, 'sampling/importance_sampling_ratio/min': 0.02078399807214737, 'sampling/importance_sampling_ratio/mean': 1.000156283378601, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5168523639440536, 'clip_ratio/low_mean': 0.00010571115899438155, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3654389653747785e-05, 'clip_ratio/high_max': 0.00013461755861499114, 'clip_ratio/region_mean': 0.00013936555023974506, 'epoch': 0.1}
+
+ 22%|██▏       | 221/1024 [16:18:12<54:48:10, 245.69s/it][AINFO 12-02 18:00:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:00:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 222/1024 [16:22:59<57:30:42, 258.16s/it][A
+                                                         [A{'loss': -0.0126, 'grad_norm': 0.0021630642004311085, 'learning_rate': 1e-05, 'num_tokens': 78106888.0, 'completions/mean_length': 6292.15625, 'completions/min_length': 462.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5436.9150390625, 'completions/min_terminated_length': 462.0, 'completions/max_terminated_length': 14953.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5039526224136353, 'reward': 0.5, 'reward_std': 0.3977220952510834, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019118161872029305, 'sampling/sampling_logp_difference/max': 3.713104724884033, 'sampling/importance_sampling_ratio/min': 0.024401644244790077, 'sampling/importance_sampling_ratio/mean': 0.9999501705169678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4610734060406685, 'clip_ratio/low_mean': 0.00031171551290753996, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2291413958773774e-05, 'clip_ratio/high_max': 0.00011977986332567525, 'clip_ratio/region_mean': 0.0003440069303906057, 'epoch': 0.1}
+
+ 22%|██▏       | 222/1024 [16:22:59<57:30:42, 258.16s/it][AINFO 12-02 18:05:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:05:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 223/1024 [16:26:32<54:25:05, 244.58s/it][A
+                                                         [A{'loss': -0.0027, 'grad_norm': 0.0009040985605679452, 'learning_rate': 1e-05, 'num_tokens': 78482551.0, 'completions/mean_length': 5724.609375, 'completions/min_length': 767.0, 'completions/max_length': 13437.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5724.609375, 'completions/min_terminated_length': 767.0, 'completions/max_terminated_length': 13437.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.24359199404716492, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01790982484817505, 'sampling/sampling_logp_difference/max': 1.5185266733169556, 'sampling/importance_sampling_ratio/min': 0.2190343737602234, 'sampling/importance_sampling_ratio/mean': 1.000087022781372, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5190568938851357, 'clip_ratio/low_mean': 7.580990495625883e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.422942879500624e-05, 'clip_ratio/high_max': 0.00012096901355107548, 'clip_ratio/region_mean': 0.00012003933125015465, 'epoch': 0.1}
+
+ 22%|██▏       | 223/1024 [16:26:32<54:25:05, 244.58s/it][AINFO 12-02 18:08:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:08:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 224/1024 [16:31:07<56:21:38, 253.62s/it][A
+                                                         [A{'loss': 0.0719, 'grad_norm': 0.0016899100737646222, 'learning_rate': 1e-05, 'num_tokens': 78937977.0, 'completions/mean_length': 6930.15625, 'completions/min_length': 1005.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6465.212890625, 'completions/min_terminated_length': 1005.0, 'completions/max_terminated_length': 15155.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.3845370411872864, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01773625612258911, 'sampling/sampling_logp_difference/max': 3.2968716621398926, 'sampling/importance_sampling_ratio/min': 0.03699873015284538, 'sampling/importance_sampling_ratio/mean': 0.9999412298202515, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42476192489266396, 'clip_ratio/low_mean': 0.00020643750076487777, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.323127259591274e-05, 'clip_ratio/high_max': 0.0001084450609596388, 'clip_ratio/region_mean': 0.00023966876688064076, 'epoch': 0.1}
+
+ 22%|██▏       | 224/1024 [16:31:07<56:21:38, 253.62s/it][AINFO 12-02 18:13:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:13:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 225/1024 [16:35:39<57:29:04, 259.00s/it][A
+                                                         [A{'loss': 0.0663, 'grad_norm': 0.0025020185858011246, 'learning_rate': 1e-05, 'num_tokens': 79311635.0, 'completions/mean_length': 5680.53125, 'completions/min_length': 869.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5154.130859375, 'completions/min_terminated_length': 869.0, 'completions/max_terminated_length': 15218.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.29826053977012634, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017010383307933807, 'sampling/sampling_logp_difference/max': 1.762050986289978, 'sampling/importance_sampling_ratio/min': 0.1716923713684082, 'sampling/importance_sampling_ratio/mean': 0.9999614953994751, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4466409422457218, 'clip_ratio/low_mean': 0.00017441119507566327, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.650019405336934e-05, 'clip_ratio/high_max': 9.090643652598374e-05, 'clip_ratio/region_mean': 0.00020091138776479056, 'epoch': 0.1}
+
+ 22%|██▏       | 225/1024 [16:35:39<57:29:04, 259.00s/it][AINFO 12-02 18:17:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:17:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:17:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:17:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 226/1024 [16:40:33<59:46:54, 269.69s/it][A
+                                                         [A{'loss': 0.0454, 'grad_norm': 0.0024153508711606264, 'learning_rate': 1e-05, 'num_tokens': 79765726.0, 'completions/mean_length': 6892.546875, 'completions/min_length': 580.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 5910.67236328125, 'completions/min_terminated_length': 580.0, 'completions/max_terminated_length': 15825.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.24039676785469055, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019792016595602036, 'sampling/sampling_logp_difference/max': 12.81136417388916, 'sampling/importance_sampling_ratio/min': 2.729576408455614e-06, 'sampling/importance_sampling_ratio/mean': 1.0001287460327148, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.49120457097887993, 'clip_ratio/low_mean': 0.00021486065270437393, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.9704782782100665e-05, 'clip_ratio/high_max': 0.0001514997079539171, 'clip_ratio/region_mean': 0.0002645654349180404, 'epoch': 0.1}
+
+ 22%|██▏       | 226/1024 [16:40:33<59:46:54, 269.69s/it][AINFO 12-02 18:22:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:22:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:22:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:22:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 227/1024 [16:45:22<60:59:19, 275.48s/it][A
+                                                         [A{'loss': 0.08, 'grad_norm': 0.0017007271526381373, 'learning_rate': 1e-05, 'num_tokens': 80109386.0, 'completions/mean_length': 5207.4375, 'completions/min_length': 590.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4260.27099609375, 'completions/min_terminated_length': 590.0, 'completions/max_terminated_length': 13895.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.3471629321575165, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.013319926336407661, 'sampling/sampling_logp_difference/max': 2.4354448318481445, 'sampling/importance_sampling_ratio/min': 0.08755878359079361, 'sampling/importance_sampling_ratio/mean': 1.0000121593475342, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3318914733827114, 'clip_ratio/low_mean': 0.00019377954231458716, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4961794995069795e-05, 'clip_ratio/high_max': 0.00012746235961458297, 'clip_ratio/region_mean': 0.00022874133810546482, 'epoch': 0.1}
+
+ 22%|██▏       | 227/1024 [16:45:22<60:59:19, 275.48s/it][AINFO 12-02 18:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:27:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:27:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 228/1024 [16:50:06<61:29:43, 278.12s/it][A
+                                                         [A{'loss': 0.0903, 'grad_norm': 0.0025096102617681026, 'learning_rate': 1e-05, 'num_tokens': 80411555.0, 'completions/mean_length': 4586.765625, 'completions/min_length': 319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4006.573486328125, 'completions/min_terminated_length': 319.0, 'completions/max_terminated_length': 16267.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.3140517771244049, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01616336964070797, 'sampling/sampling_logp_difference/max': 1.4468202590942383, 'sampling/importance_sampling_ratio/min': 0.23559969663619995, 'sampling/importance_sampling_ratio/mean': 0.9999880790710449, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40994085371494293, 'clip_ratio/low_mean': 0.00022203384924068814, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2490278954355745e-05, 'clip_ratio/high_max': 0.00011214469668630045, 'clip_ratio/region_mean': 0.00025452412955928594, 'epoch': 0.1}
+
+ 22%|██▏       | 228/1024 [16:50:06<61:29:43, 278.12s/it][AINFO 12-02 18:32:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:32:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:32:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:32:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 229/1024 [16:54:52<61:53:59, 280.30s/it][A
+                                                         [A{'loss': 0.1482, 'grad_norm': 0.0042454032227396965, 'learning_rate': 1e-05, 'num_tokens': 80714004.0, 'completions/mean_length': 4593.765625, 'completions/min_length': 369.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4406.619140625, 'completions/min_terminated_length': 369.0, 'completions/max_terminated_length': 16056.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.45026895403862, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015605361200869083, 'sampling/sampling_logp_difference/max': 2.078511953353882, 'sampling/importance_sampling_ratio/min': 0.1251162588596344, 'sampling/importance_sampling_ratio/mean': 0.9998003244400024, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4641117826104164, 'clip_ratio/low_mean': 0.0001495746337241144, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.579360868068761e-05, 'clip_ratio/high_max': 0.00017713156830723165, 'clip_ratio/region_mean': 0.00020536824285954935, 'epoch': 0.11}
+
+ 22%|██▏       | 229/1024 [16:54:52<61:53:59, 280.30s/it][AINFO 12-02 18:36:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:36:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 230/1024 [16:59:36<62:03:49, 281.40s/it][A
+                                                         [A{'loss': 0.0922, 'grad_norm': 0.000982000376097858, 'learning_rate': 1e-05, 'num_tokens': 81007229.0, 'completions/mean_length': 4442.015625, 'completions/min_length': 501.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 3854.70458984375, 'completions/min_terminated_length': 501.0, 'completions/max_terminated_length': 15370.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.23568856716156006, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.014811690896749496, 'sampling/sampling_logp_difference/max': 1.1643218994140625, 'sampling/importance_sampling_ratio/min': 0.3121342360973358, 'sampling/importance_sampling_ratio/mean': 1.0000309944152832, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4759037382900715, 'clip_ratio/low_mean': 0.00010964347529807128, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3575485809269594e-05, 'clip_ratio/high_max': 5.4301943237078376e-05, 'clip_ratio/region_mean': 0.00012321896201683558, 'epoch': 0.11}
+
+ 22%|██▏       | 230/1024 [16:59:36<62:03:49, 281.40s/it][AINFO 12-02 18:41:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:41:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:41:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:41:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 231/1024 [17:04:18<62:02:20, 281.64s/it][A
+                                                         [A{'loss': -0.0108, 'grad_norm': 0.0014373867306858301, 'learning_rate': 1e-05, 'num_tokens': 81422826.0, 'completions/mean_length': 6337.203125, 'completions/min_length': 862.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6013.11279296875, 'completions/min_terminated_length': 862.0, 'completions/max_terminated_length': 16015.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.359375, 'reward_std': 0.29355230927467346, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02140781655907631, 'sampling/sampling_logp_difference/max': 3.3715882301330566, 'sampling/importance_sampling_ratio/min': 0.03433506190776825, 'sampling/importance_sampling_ratio/mean': 1.000078797340393, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5657722055912018, 'clip_ratio/low_mean': 0.00017673981255938997, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6067098133353284e-05, 'clip_ratio/high_max': 8.269613863376435e-05, 'clip_ratio/region_mean': 0.0002028069102379959, 'epoch': 0.11}
+
+ 23%|██▎       | 231/1024 [17:04:18<62:02:20, 281.64s/it][AINFO 12-02 18:46:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:46:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:46:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:46:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 232/1024 [17:09:01<62:01:49, 281.96s/it][A
+                                                         [A{'loss': 0.1738, 'grad_norm': 0.0015173805877566338, 'learning_rate': 1e-05, 'num_tokens': 81874521.0, 'completions/mean_length': 6890.484375, 'completions/min_length': 1082.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6257.58349609375, 'completions/min_terminated_length': 1082.0, 'completions/max_terminated_length': 15230.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.400318443775177, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019038155674934387, 'sampling/sampling_logp_difference/max': 1.2821269035339355, 'sampling/importance_sampling_ratio/min': 0.3208940327167511, 'sampling/importance_sampling_ratio/mean': 0.9999370574951172, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.49876927584409714, 'clip_ratio/low_mean': 0.00023489978411816992, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.762118803431804e-05, 'clip_ratio/high_max': 0.00016791999496490462, 'clip_ratio/region_mean': 0.00029252096464915667, 'epoch': 0.11}
+
+ 23%|██▎       | 232/1024 [17:09:01<62:01:49, 281.96s/it][AINFO 12-02 18:51:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:51:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:51:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:51:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 233/1024 [17:14:33<65:17:49, 297.18s/it][A
+                                                         [A{'loss': 0.0279, 'grad_norm': 0.0017784368246793747, 'learning_rate': 1e-05, 'num_tokens': 82278610.0, 'completions/mean_length': 6170.140625, 'completions/min_length': 681.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 4711.01806640625, 'completions/min_terminated_length': 681.0, 'completions/max_terminated_length': 16191.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4917473793029785, 'reward': 0.609375, 'reward_std': 0.2109457403421402, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018276207149028778, 'sampling/sampling_logp_difference/max': 1.548595666885376, 'sampling/importance_sampling_ratio/min': 0.2725621461868286, 'sampling/importance_sampling_ratio/mean': 0.999990701675415, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5418903082609177, 'clip_ratio/low_mean': 0.00010576232944004005, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3737378796795383e-05, 'clip_ratio/high_max': 8.97285640348855e-05, 'clip_ratio/region_mean': 0.00012949970914633013, 'epoch': 0.11}
+
+ 23%|██▎       | 233/1024 [17:14:33<65:17:49, 297.18s/it][AINFO 12-02 18:56:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:56:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:56:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:56:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 234/1024 [17:19:28<65:00:51, 296.27s/it][A
+                                                         [A{'loss': 0.0284, 'grad_norm': 0.004276667255908251, 'learning_rate': 1e-05, 'num_tokens': 82736367.0, 'completions/mean_length': 6991.953125, 'completions/min_length': 491.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6842.87353515625, 'completions/min_terminated_length': 491.0, 'completions/max_terminated_length': 15899.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.3776973485946655, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019142955541610718, 'sampling/sampling_logp_difference/max': 1.8119025230407715, 'sampling/importance_sampling_ratio/min': 0.16334307193756104, 'sampling/importance_sampling_ratio/mean': 0.9999826550483704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.47046585008502007, 'clip_ratio/low_mean': 0.0004091848904863582, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.763163653682568e-05, 'clip_ratio/high_max': 0.00017815040882851463, 'clip_ratio/region_mean': 0.0004668165192924789, 'epoch': 0.11}
+
+ 23%|██▎       | 234/1024 [17:19:28<65:00:51, 296.27s/it][AINFO 12-02 19:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:01:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:01:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 235/1024 [17:23:18<60:34:21, 276.38s/it][A
+                                                         [A{'loss': -0.018, 'grad_norm': 0.0038484896067529917, 'learning_rate': 1e-05, 'num_tokens': 83052079.0, 'completions/mean_length': 4743.875, 'completions/min_length': 950.0, 'completions/max_length': 13924.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4743.875, 'completions/min_terminated_length': 950.0, 'completions/max_terminated_length': 13924.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.2756394147872925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017475087195634842, 'sampling/sampling_logp_difference/max': 3.6536145210266113, 'sampling/importance_sampling_ratio/min': 0.025897353887557983, 'sampling/importance_sampling_ratio/mean': 1.0001256465911865, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6164797060191631, 'clip_ratio/low_mean': 0.00011719972371793119, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.201239926260314e-05, 'clip_ratio/high_max': 0.00011669618379528401, 'clip_ratio/region_mean': 0.00015921212252578698, 'epoch': 0.11}
+
+ 23%|██▎       | 235/1024 [17:23:18<60:34:21, 276.38s/it][AINFO 12-02 19:05:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:05:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:05:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:05:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 236/1024 [17:27:38<59:28:38, 271.72s/it][A
+                                                         [A{'loss': 0.0746, 'grad_norm': 0.0007693638326600194, 'learning_rate': 1e-05, 'num_tokens': 83373834.0, 'completions/mean_length': 4863.296875, 'completions/min_length': 428.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4680.4287109375, 'completions/min_terminated_length': 428.0, 'completions/max_terminated_length': 15423.0, 'rewards/accuracy_reward/mean': 0.765625, 'rewards/accuracy_reward/std': 0.42695629596710205, 'reward': 0.765625, 'reward_std': 0.21778544783592224, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.016916219145059586, 'sampling/sampling_logp_difference/max': 3.508268356323242, 'sampling/importance_sampling_ratio/min': 0.029948730021715164, 'sampling/importance_sampling_ratio/mean': 0.9999876022338867, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.44430097937583923, 'clip_ratio/low_mean': 9.41199950830196e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.1069131839321926e-05, 'clip_ratio/high_max': 0.00015552915283478796, 'clip_ratio/region_mean': 0.00013518913056032034, 'epoch': 0.11}
+
+ 23%|██▎       | 236/1024 [17:27:38<59:28:38, 271.72s/it][AINFO 12-02 19:09:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:09:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 237/1024 [17:32:46<61:43:41, 282.37s/it][A
+                                                         [A{'loss': -0.012, 'grad_norm': 0.003060512710362673, 'learning_rate': 1e-05, 'num_tokens': 83711289.0, 'completions/mean_length': 5122.734375, 'completions/min_length': 714.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 3957.77587890625, 'completions/min_terminated_length': 714.0, 'completions/max_terminated_length': 16048.0, 'rewards/accuracy_reward/mean': 0.6875, 'rewards/accuracy_reward/std': 0.467176616191864, 'reward': 0.6875, 'reward_std': 0.2041158676147461, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.014982285909354687, 'sampling/sampling_logp_difference/max': 1.6765105724334717, 'sampling/importance_sampling_ratio/min': 0.314321905374527, 'sampling/importance_sampling_ratio/mean': 1.0000354051589966, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42493540421128273, 'clip_ratio/low_mean': 4.570606643028441e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1834867766301613e-05, 'clip_ratio/high_max': 6.717119504173752e-05, 'clip_ratio/region_mean': 6.754093374183867e-05, 'epoch': 0.11}
+
+ 23%|██▎       | 237/1024 [17:32:46<61:43:41, 282.37s/it][AINFO 12-02 19:14:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:14:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 238/1024 [17:37:33<61:58:12, 283.83s/it][A
+                                                         [A{'loss': 0.1429, 'grad_norm': 0.0029973885975778103, 'learning_rate': 1e-05, 'num_tokens': 84106199.0, 'completions/mean_length': 6021.84375, 'completions/min_length': 695.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5143.69482421875, 'completions/min_terminated_length': 695.0, 'completions/max_terminated_length': 14767.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.4187060594558716, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017083361744880676, 'sampling/sampling_logp_difference/max': 2.1262426376342773, 'sampling/importance_sampling_ratio/min': 0.11928464472293854, 'sampling/importance_sampling_ratio/mean': 1.0000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4496769346296787, 'clip_ratio/low_mean': 0.0001812346436054213, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.806560008953966e-05, 'clip_ratio/high_max': 0.00019947911187045975, 'clip_ratio/region_mean': 0.00024930024392233463, 'epoch': 0.11}
+
+ 23%|██▎       | 238/1024 [17:37:33<61:58:12, 283.83s/it][AINFO 12-02 19:19:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:19:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 239/1024 [17:42:39<63:19:41, 290.42s/it][A
+                                                         [A{'loss': 0.0419, 'grad_norm': 0.002654111245647073, 'learning_rate': 1e-05, 'num_tokens': 84557219.0, 'completions/mean_length': 6895.8125, 'completions/min_length': 1234.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6589.74169921875, 'completions/min_terminated_length': 1234.0, 'completions/max_terminated_length': 15261.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.484375, 'reward_std': 0.22097086906433105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01897004060447216, 'sampling/sampling_logp_difference/max': 3.1335809230804443, 'sampling/importance_sampling_ratio/min': 0.04356152564287186, 'sampling/importance_sampling_ratio/mean': 0.9999165534973145, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6133008375763893, 'clip_ratio/low_mean': 0.00017238608097613906, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0850019179997616e-05, 'clip_ratio/high_max': 3.683115664898651e-05, 'clip_ratio/region_mean': 0.0001832361008382577, 'epoch': 0.11}
+
+ 23%|██▎       | 239/1024 [17:42:39<63:19:41, 290.42s/it][AINFO 12-02 19:24:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:24:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:24:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:24:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 23%|██▎       | 240/1024 [17:46:58<61:13:33, 281.14s/it][A
+                                                         [A{'loss': 0.0471, 'grad_norm': 0.003680360270664096, 'learning_rate': 1e-05, 'num_tokens': 84821268.0, 'completions/mean_length': 3918.140625, 'completions/min_length': 888.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 3720.27001953125, 'completions/min_terminated_length': 888.0, 'completions/max_terminated_length': 15268.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.3424547016620636, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.014424344524741173, 'sampling/sampling_logp_difference/max': 1.520143985748291, 'sampling/importance_sampling_ratio/min': 0.21868041157722473, 'sampling/importance_sampling_ratio/mean': 0.9999188184738159, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.356719171628356, 'clip_ratio/low_mean': 0.00021527314493141603, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.838759528207447e-05, 'clip_ratio/high_max': 0.00014071275063542998, 'clip_ratio/region_mean': 0.00025366073987243, 'epoch': 0.11}
+
+ 23%|██▎       | 240/1024 [17:46:58<61:13:33, 281.14s/it][AINFO 12-02 19:29:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:29:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:29:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:29:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 241/1024 [17:51:34<60:46:48, 279.45s/it][A
+                                                         [A{'loss': 0.0735, 'grad_norm': 0.0013610277092084289, 'learning_rate': 1e-05, 'num_tokens': 85258670.0, 'completions/mean_length': 6668.15625, 'completions/min_length': 636.0, 'completions/max_length': 15902.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6668.15625, 'completions/min_terminated_length': 636.0, 'completions/max_terminated_length': 15902.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.34717273712158203, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018126193434000015, 'sampling/sampling_logp_difference/max': 1.6091341972351074, 'sampling/importance_sampling_ratio/min': 0.20006075501441956, 'sampling/importance_sampling_ratio/mean': 0.9999465942382812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5119834914803505, 'clip_ratio/low_mean': 0.00015056641859700903, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.203770933803753e-05, 'clip_ratio/high_max': 0.00014035894673725124, 'clip_ratio/region_mean': 0.00020260413566575153, 'epoch': 0.11}
+
+ 24%|██▎       | 241/1024 [17:51:34<60:46:48, 279.45s/it][AINFO 12-02 19:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:33:39 [block_pool.py:292] Successfully reset prefix cache
+[OpenTinker] 2025-12-02 19:36:23,679 - math_verify.grader - WARNING - Timeout during comparison
+
+ 24%|██▎       | 242/1024 [17:56:07<60:18:41, 277.65s/it][A
+                                                         [A{'loss': -0.0433, 'grad_norm': 0.0012310061138123274, 'learning_rate': 1e-05, 'num_tokens': 85600990.0, 'completions/mean_length': 5203.625, 'completions/min_length': 694.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 4842.9677734375, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 16281.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.2619796395301819, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01749185100197792, 'sampling/sampling_logp_difference/max': 1.6309679746627808, 'sampling/importance_sampling_ratio/min': 0.1957399994134903, 'sampling/importance_sampling_ratio/mean': 0.9998469352722168, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4269341044127941, 'clip_ratio/low_mean': 0.0002824985767801991, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3161428771345527e-05, 'clip_ratio/high_max': 7.972180992510403e-05, 'clip_ratio/region_mean': 0.0003056600053241709, 'epoch': 0.11}
+
+ 24%|██▎       | 242/1024 [17:56:07<60:18:41, 277.65s/it][AINFO 12-02 19:38:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:38:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:38:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:38:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▎       | 243/1024 [18:01:24<62:47:28, 289.43s/it][A
+                                                         [A{'loss': 0.1244, 'grad_norm': 0.0016366615891456604, 'learning_rate': 1e-05, 'num_tokens': 86196835.0, 'completions/mean_length': 9127.828125, 'completions/min_length': 479.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.15625, 'completions/mean_terminated_length': 7784.0927734375, 'completions/min_terminated_length': 479.0, 'completions/max_terminated_length': 15294.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5039526224136353, 'reward': 0.5, 'reward_std': 0.3424547016620636, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017813922837376595, 'sampling/sampling_logp_difference/max': 3.3963608741760254, 'sampling/importance_sampling_ratio/min': 0.03349493816494942, 'sampling/importance_sampling_ratio/mean': 0.9999416470527649, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4373306594789028, 'clip_ratio/low_mean': 0.0004982586724509019, 'clip_ratio/low_min': 8.9730183390202e-05, 'clip_ratio/high_mean': 3.4745204743558133e-05, 'clip_ratio/high_max': 0.00010031662395704188, 'clip_ratio/region_mean': 0.0005330038620741107, 'epoch': 0.11}
+
+ 24%|██▎       | 243/1024 [18:01:24<62:47:28, 289.43s/it][AINFO 12-02 19:43:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:43:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:43:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:43:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 244/1024 [18:06:29<63:43:22, 294.11s/it][A
+                                                         [A{'loss': 0.0342, 'grad_norm': 0.0011659173760563135, 'learning_rate': 1e-05, 'num_tokens': 86692932.0, 'completions/mean_length': 7613.890625, 'completions/min_length': 723.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6870.6611328125, 'completions/min_terminated_length': 723.0, 'completions/max_terminated_length': 15947.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.501733124256134, 'reward': 0.453125, 'reward_std': 0.28460076451301575, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02279534563422203, 'sampling/sampling_logp_difference/max': 1.6359646320343018, 'sampling/importance_sampling_ratio/min': 0.1947644054889679, 'sampling/importance_sampling_ratio/mean': 1.000012755393982, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7129848003387451, 'clip_ratio/low_mean': 0.00019994324975414202, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.042230127903167e-05, 'clip_ratio/high_max': 0.00010366754941060208, 'clip_ratio/region_mean': 0.0002303655564901419, 'epoch': 0.11}
+
+ 24%|██▍       | 244/1024 [18:06:29<63:43:22, 294.11s/it][AINFO 12-02 19:48:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:48:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:48:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:48:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 245/1024 [18:11:09<62:43:39, 289.88s/it][A
+                                                         [A{'loss': 0.1398, 'grad_norm': 0.0018372249323874712, 'learning_rate': 1e-05, 'num_tokens': 87101238.0, 'completions/mean_length': 6233.40625, 'completions/min_length': 673.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5373.1865234375, 'completions/min_terminated_length': 673.0, 'completions/max_terminated_length': 14755.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.24831004440784454, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018598943948745728, 'sampling/sampling_logp_difference/max': 5.0596394538879395, 'sampling/importance_sampling_ratio/min': 0.006347848102450371, 'sampling/importance_sampling_ratio/mean': 1.0000430345535278, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.49797332659363747, 'clip_ratio/low_mean': 0.0001352443855466845, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.645371009497467e-05, 'clip_ratio/high_max': 0.00012050778286720742, 'clip_ratio/region_mean': 0.00017169809234474087, 'epoch': 0.11}
+
+ 24%|██▍       | 245/1024 [18:11:09<62:43:39, 289.88s/it][AINFO 12-02 19:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:53:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 246/1024 [18:16:37<65:06:25, 301.27s/it][A
+                                                         [A{'loss': 0.071, 'grad_norm': 0.0006606021779589355, 'learning_rate': 1e-05, 'num_tokens': 87712201.0, 'completions/mean_length': 9391.296875, 'completions/min_length': 1343.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.140625, 'completions/mean_terminated_length': 8247.0361328125, 'completions/min_terminated_length': 1343.0, 'completions/max_terminated_length': 16348.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5037065148353577, 'reward': 0.515625, 'reward_std': 0.308285653591156, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021746061742305756, 'sampling/sampling_logp_difference/max': 5.540142059326172, 'sampling/importance_sampling_ratio/min': 0.003925969358533621, 'sampling/importance_sampling_ratio/mean': 0.9999644756317139, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5225642211735249, 'clip_ratio/low_mean': 0.0003432221692492021, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.907059741934063e-05, 'clip_ratio/high_max': 0.00017326994475297397, 'clip_ratio/region_mean': 0.00041229276212106925, 'epoch': 0.11}
+
+ 24%|██▍       | 246/1024 [18:16:37<65:06:25, 301.27s/it][AINFO 12-02 19:58:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 19:58:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 247/1024 [18:21:06<62:55:43, 291.56s/it][A
+                                                         [A{'loss': 0.0139, 'grad_norm': 0.0, 'learning_rate': 1e-05, 'num_tokens': 87977648.0, 'completions/mean_length': 3964.234375, 'completions/min_length': 717.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 3563.5966796875, 'completions/min_terminated_length': 717.0, 'completions/max_terminated_length': 16152.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.04419417306780815, 'frac_reward_zero_std': 0.875, 'sampling/sampling_logp_difference/mean': 0.01712377555668354, 'sampling/sampling_logp_difference/max': 4.13956880569458, 'sampling/importance_sampling_ratio/min': 0.01592971943318844, 'sampling/importance_sampling_ratio/mean': 1.0000314712524414, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.525791771709919, 'clip_ratio/low_mean': 0.0, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.007304818922421e-06, 'clip_ratio/high_max': 2.4029219275689684e-05, 'clip_ratio/region_mean': 6.007304818922421e-06, 'epoch': 0.11}
+
+ 24%|██▍       | 247/1024 [18:21:06<62:55:43, 291.56s/it][AINFO 12-02 20:03:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:03:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 248/1024 [18:25:52<62:28:18, 289.82s/it][A
+                                                         [A{'loss': 0.0084, 'grad_norm': 0.002795964479446411, 'learning_rate': 1e-05, 'num_tokens': 88353853.0, 'completions/mean_length': 5745.203125, 'completions/min_length': 481.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5221.9833984375, 'completions/min_terminated_length': 481.0, 'completions/max_terminated_length': 16206.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017458371818065643, 'sampling/sampling_logp_difference/max': 2.2369706630706787, 'sampling/importance_sampling_ratio/min': 0.1067814901471138, 'sampling/importance_sampling_ratio/mean': 1.0000864267349243, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4876930043101311, 'clip_ratio/low_mean': 0.00023576771945954533, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.8121013125855825e-05, 'clip_ratio/high_max': 0.0001948268791238661, 'clip_ratio/region_mean': 0.00029388873099378543, 'epoch': 0.11}
+
+ 24%|██▍       | 248/1024 [18:25:52<62:28:18, 289.82s/it][AINFO 12-02 20:07:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:07:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:07:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:07:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 249/1024 [18:30:39<62:15:04, 289.17s/it][A
+                                                         [A{'loss': 0.0524, 'grad_norm': 0.0021594560239464045, 'learning_rate': 1e-05, 'num_tokens': 88788661.0, 'completions/mean_length': 6539.5, 'completions/min_length': 1223.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5883.2001953125, 'completions/min_terminated_length': 1223.0, 'completions/max_terminated_length': 15602.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.3514062464237213, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.015474091283977032, 'sampling/sampling_logp_difference/max': 5.499971389770508, 'sampling/importance_sampling_ratio/min': 0.004086888395249844, 'sampling/importance_sampling_ratio/mean': 1.0001180171966553, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4174750857055187, 'clip_ratio/low_mean': 0.0002118818921417187, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2539359608563245e-05, 'clip_ratio/high_max': 9.58418977461406e-05, 'clip_ratio/region_mean': 0.00024442124777124263, 'epoch': 0.11}
+
+ 24%|██▍       | 249/1024 [18:30:39<62:15:04, 289.17s/it][AINFO 12-02 20:12:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:12:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:12:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:12:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 24%|██▍       | 250/1024 [18:34:51<59:45:34, 277.95s/it][A
+                                                         [A{'loss': 0.0852, 'grad_norm': 0.0036405690480023623, 'learning_rate': 1e-05, 'num_tokens': 89098311.0, 'completions/mean_length': 4678.78125, 'completions/min_length': 870.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4492.984375, 'completions/min_terminated_length': 870.0, 'completions/max_terminated_length': 15597.0, 'rewards/accuracy_reward/mean': 0.859375, 'rewards/accuracy_reward/std': 0.3503824472427368, 'reward': 0.859375, 'reward_std': 0.2198973000049591, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01609473116695881, 'sampling/sampling_logp_difference/max': 2.2190399169921875, 'sampling/importance_sampling_ratio/min': 0.10871343314647675, 'sampling/importance_sampling_ratio/mean': 0.9999558925628662, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4138566628098488, 'clip_ratio/low_mean': 0.00014659917678727652, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1253723818736034e-05, 'clip_ratio/high_max': 9.525653422315372e-05, 'clip_ratio/region_mean': 0.00017785290037863888, 'epoch': 0.11}
+
+ 24%|██▍       | 250/1024 [18:34:51<59:45:34, 277.95s/it][AINFO 12-02 20:16:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:16:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:16:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:16:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 251/1024 [18:40:33<63:47:11, 297.07s/it][A
+                                                         [A{'loss': 0.0823, 'grad_norm': 0.0033660451881587505, 'learning_rate': 1e-05, 'num_tokens': 89700215.0, 'completions/mean_length': 9259.25, 'completions/min_length': 621.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.171875, 'completions/mean_terminated_length': 7780.5283203125, 'completions/min_terminated_length': 621.0, 'completions/max_terminated_length': 16177.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.421875, 'reward_std': 0.31983357667922974, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01940334215760231, 'sampling/sampling_logp_difference/max': 2.2370476722717285, 'sampling/importance_sampling_ratio/min': 0.1067732647061348, 'sampling/importance_sampling_ratio/mean': 0.9999710321426392, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4665018580853939, 'clip_ratio/low_mean': 0.00038837966985738603, 'clip_ratio/low_min': 5.279729521134868e-05, 'clip_ratio/high_mean': 3.281560680079565e-05, 'clip_ratio/high_max': 9.14018673938699e-05, 'clip_ratio/region_mean': 0.0004211952782497974, 'epoch': 0.12}
+
+ 25%|██▍       | 251/1024 [18:40:33<63:47:11, 297.07s/it][AINFO 12-02 20:22:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:22:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:22:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:22:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 252/1024 [18:45:07<62:15:35, 290.33s/it][A
+                                                         [A{'loss': 0.0374, 'grad_norm': 0.0020246391650289297, 'learning_rate': 1e-05, 'num_tokens': 89934819.0, 'completions/mean_length': 3534.8125, 'completions/min_length': 676.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 2902.885009765625, 'completions/min_terminated_length': 676.0, 'completions/max_terminated_length': 13897.0, 'rewards/accuracy_reward/mean': 0.765625, 'rewards/accuracy_reward/std': 0.42695629596710205, 'reward': 0.765625, 'reward_std': 0.38664889335632324, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.014003757387399673, 'sampling/sampling_logp_difference/max': 1.9525107145309448, 'sampling/importance_sampling_ratio/min': 0.14191730320453644, 'sampling/importance_sampling_ratio/mean': 1.0000638961791992, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.36453455314040184, 'clip_ratio/low_mean': 0.0001508109121459711, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.064363350968051e-05, 'clip_ratio/high_max': 0.0002627423518788419, 'clip_ratio/region_mean': 0.00023145454269979382, 'epoch': 0.12}
+
+ 25%|██▍       | 252/1024 [18:45:07<62:15:35, 290.33s/it][AINFO 12-02 20:27:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:27:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 253/1024 [18:49:45<61:22:57, 286.61s/it][A
+                                                         [A{'loss': 0.0494, 'grad_norm': 0.003098095301538706, 'learning_rate': 1e-05, 'num_tokens': 90318151.0, 'completions/mean_length': 5845.0625, 'completions/min_length': 548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5326.75390625, 'completions/min_terminated_length': 548.0, 'completions/max_terminated_length': 15887.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.3661494255065918, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01603994145989418, 'sampling/sampling_logp_difference/max': 4.4999518394470215, 'sampling/importance_sampling_ratio/min': 0.011109532788395882, 'sampling/importance_sampling_ratio/mean': 1.000133752822876, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.40485677868127823, 'clip_ratio/low_mean': 0.0003159538173349574, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.4513947614177596e-05, 'clip_ratio/high_max': 0.00011425786578911357, 'clip_ratio/region_mean': 0.0003604677658586297, 'epoch': 0.12}
+
+ 25%|██▍       | 253/1024 [18:49:45<61:22:57, 286.61s/it][AINFO 12-02 20:31:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:31:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:31:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:31:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 254/1024 [18:55:38<65:34:04, 306.55s/it][A
+                                                         [A{'loss': 0.027, 'grad_norm': 0.002317545237019658, 'learning_rate': 1e-05, 'num_tokens': 90927798.0, 'completions/mean_length': 9362.734375, 'completions/min_length': 967.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.21875, 'completions/mean_terminated_length': 7396.77978515625, 'completions/min_terminated_length': 967.0, 'completions/max_terminated_length': 16242.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.40625, 'reward_std': 0.40715813636779785, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019569456577301025, 'sampling/sampling_logp_difference/max': 4.698964595794678, 'sampling/importance_sampling_ratio/min': 0.009104698896408081, 'sampling/importance_sampling_ratio/mean': 1.0000003576278687, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5019086040556431, 'clip_ratio/low_mean': 0.00042032813962578075, 'clip_ratio/low_min': 1.704855458228849e-05, 'clip_ratio/high_mean': 8.711725513421698e-05, 'clip_ratio/high_max': 0.0002492800213076407, 'clip_ratio/region_mean': 0.0005074453911220189, 'epoch': 0.12}
+
+ 25%|██▍       | 254/1024 [18:55:38<65:34:04, 306.55s/it][AINFO 12-02 20:37:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:37:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:37:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:37:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 255/1024 [19:00:04<62:51:22, 294.26s/it][A
+                                                         [A{'loss': 0.0496, 'grad_norm': 0.0019666694570332766, 'learning_rate': 1e-05, 'num_tokens': 91297622.0, 'completions/mean_length': 5600.25, 'completions/min_length': 738.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5252.38671875, 'completions/min_terminated_length': 738.0, 'completions/max_terminated_length': 15054.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.26409146189689636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017555218189954758, 'sampling/sampling_logp_difference/max': 1.4214391708374023, 'sampling/importance_sampling_ratio/min': 0.2413664013147354, 'sampling/importance_sampling_ratio/mean': 0.9999500513076782, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5131070017814636, 'clip_ratio/low_mean': 0.0002540413888709736, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.716160995281825e-05, 'clip_ratio/high_max': 0.0001281084669244592, 'clip_ratio/region_mean': 0.0002912030031438917, 'epoch': 0.12}
+
+ 25%|██▍       | 255/1024 [19:00:04<62:51:22, 294.26s/it][AINFO 12-02 20:42:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:42:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:42:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:42:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 256/1024 [19:04:30<60:59:52, 285.93s/it][A
+                                                         [A{'loss': -0.0095, 'grad_norm': 0.0020149960182607174, 'learning_rate': 1e-05, 'num_tokens': 91700399.0, 'completions/mean_length': 6148.265625, 'completions/min_length': 131.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5985.7939453125, 'completions/min_terminated_length': 131.0, 'completions/max_terminated_length': 15713.0, 'rewards/accuracy_reward/mean': 0.765625, 'rewards/accuracy_reward/std': 0.42695629596710205, 'reward': 0.765625, 'reward_std': 0.38664886355400085, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019084926694631577, 'sampling/sampling_logp_difference/max': 3.370089292526245, 'sampling/importance_sampling_ratio/min': 0.20178046822547913, 'sampling/importance_sampling_ratio/mean': 1.0000580549240112, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4403763748705387, 'clip_ratio/low_mean': 0.0001435444401067798, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.00010443367000334547, 'clip_ratio/high_max': 0.00023221780247695278, 'clip_ratio/region_mean': 0.000247978112383862, 'epoch': 0.12}
+
+ 25%|██▌       | 256/1024 [19:04:30<60:59:52, 285.93s/it][AINFO 12-02 20:46:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:46:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:46:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:46:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 257/1024 [19:09:08<60:24:50, 283.56s/it][A
+                                                         [A{'loss': 0.0441, 'grad_norm': 0.0025008381344377995, 'learning_rate': 1e-05, 'num_tokens': 92054517.0, 'completions/mean_length': 5388.09375, 'completions/min_length': 1526.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4847.31103515625, 'completions/min_terminated_length': 1526.0, 'completions/max_terminated_length': 15324.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.4836103618144989, 'reward': 0.640625, 'reward_std': 0.3140517771244049, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019893258810043335, 'sampling/sampling_logp_difference/max': 3.9410645961761475, 'sampling/importance_sampling_ratio/min': 0.019427521154284477, 'sampling/importance_sampling_ratio/mean': 1.0001994371414185, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5359686389565468, 'clip_ratio/low_mean': 0.00030131601306493394, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.26411488156009e-05, 'clip_ratio/high_max': 0.00014185871441441122, 'clip_ratio/region_mean': 0.0003639571586973034, 'epoch': 0.12}
+
+ 25%|██▌       | 257/1024 [19:09:08<60:24:50, 283.56s/it][AINFO 12-02 20:51:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:51:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 258/1024 [19:13:44<59:49:02, 281.13s/it][A
+                                                         [A{'loss': -0.026, 'grad_norm': 0.0013790081720799208, 'learning_rate': 1e-05, 'num_tokens': 92540362.0, 'completions/mean_length': 7388.828125, 'completions/min_length': 1067.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7246.0478515625, 'completions/min_terminated_length': 1067.0, 'completions/max_terminated_length': 14044.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5029674172401428, 'reward': 0.53125, 'reward_std': 0.19727617502212524, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021587181836366653, 'sampling/sampling_logp_difference/max': 8.385900497436523, 'sampling/importance_sampling_ratio/min': 0.00022806029301136732, 'sampling/importance_sampling_ratio/mean': 1.0000412464141846, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7122420892119408, 'clip_ratio/low_mean': 0.0001150462485384196, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.1538489085723995e-05, 'clip_ratio/high_max': 0.00011678758528432809, 'clip_ratio/region_mean': 0.00015658474080737506, 'epoch': 0.12}
+
+ 25%|██▌       | 258/1024 [19:13:44<59:49:02, 281.13s/it][AINFO 12-02 20:55:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:55:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:55:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 20:55:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 259/1024 [19:19:28<63:46:32, 300.12s/it][A
+                                                         [A{'loss': -0.0363, 'grad_norm': 0.001344260643236339, 'learning_rate': 1e-05, 'num_tokens': 93079559.0, 'completions/mean_length': 8248.078125, 'completions/min_length': 1462.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 7085.8037109375, 'completions/min_terminated_length': 1462.0, 'completions/max_terminated_length': 14736.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.19044628739356995, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017266741022467613, 'sampling/sampling_logp_difference/max': 9.292439460754395, 'sampling/importance_sampling_ratio/min': 9.211807628162205e-05, 'sampling/importance_sampling_ratio/mean': 0.9998832941055298, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.47083580493927, 'clip_ratio/low_mean': 0.00011602612175920513, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.853472207993036e-05, 'clip_ratio/high_max': 0.0001371676135022426, 'clip_ratio/region_mean': 0.00015456084292964078, 'epoch': 0.12}
+
+ 25%|██▌       | 259/1024 [19:19:28<63:46:32, 300.12s/it][AINFO 12-02 21:01:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:01:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:01:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:01:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 260/1024 [19:24:30<63:49:30, 300.75s/it][A
+                                                         [A{'loss': -0.0092, 'grad_norm': 0.0009925465565174818, 'learning_rate': 1e-05, 'num_tokens': 93458977.0, 'completions/mean_length': 5764.78125, 'completions/min_length': 317.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5422.2255859375, 'completions/min_terminated_length': 317.0, 'completions/max_terminated_length': 16282.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.25513991713523865, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0166617464274168, 'sampling/sampling_logp_difference/max': 1.7251324653625488, 'sampling/importance_sampling_ratio/min': 0.17814946174621582, 'sampling/importance_sampling_ratio/mean': 0.9999211430549622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.46327096968889236, 'clip_ratio/low_mean': 6.363988757129846e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4952666788303759e-05, 'clip_ratio/high_max': 5.9810667153215036e-05, 'clip_ratio/region_mean': 7.85925549280364e-05, 'epoch': 0.12}
+
+ 25%|██▌       | 260/1024 [19:24:30<63:49:30, 300.75s/it][AINFO 12-02 21:06:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:06:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:06:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:06:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▌       | 261/1024 [19:29:48<64:49:45, 305.88s/it][A
+                                                         [A{'loss': 0.1772, 'grad_norm': 0.0017945750150829554, 'learning_rate': 1e-05, 'num_tokens': 93893104.0, 'completions/mean_length': 6652.234375, 'completions/min_length': 541.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6003.4501953125, 'completions/min_terminated_length': 541.0, 'completions/max_terminated_length': 15978.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.48795005679130554, 'reward': 0.625, 'reward_std': 0.3866586685180664, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01776384748518467, 'sampling/sampling_logp_difference/max': 8.68387222290039, 'sampling/importance_sampling_ratio/min': 0.00016929424600675702, 'sampling/importance_sampling_ratio/mean': 1.0000324249267578, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4734529182314873, 'clip_ratio/low_mean': 0.0002939883343060501, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.400597072162782e-05, 'clip_ratio/high_max': 0.0001306902959186118, 'clip_ratio/region_mean': 0.00033799430093495175, 'epoch': 0.12}
+
+ 25%|██▌       | 261/1024 [19:29:48<64:49:45, 305.88s/it][AINFO 12-02 21:11:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:11:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:11:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:11:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 262/1024 [19:34:32<63:19:48, 299.20s/it][A
+                                                         [A{'loss': 0.1638, 'grad_norm': 0.0036523109301924706, 'learning_rate': 1e-05, 'num_tokens': 94235841.0, 'completions/mean_length': 5219.265625, 'completions/min_length': 806.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 4273.1015625, 'completions/min_terminated_length': 806.0, 'completions/max_terminated_length': 15057.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.49776285886764526, 'reward': 0.578125, 'reward_std': 0.3934885859489441, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020383019000291824, 'sampling/sampling_logp_difference/max': 1.5323386192321777, 'sampling/importance_sampling_ratio/min': 0.21602988243103027, 'sampling/importance_sampling_ratio/mean': 1.0000405311584473, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5115125477313995, 'clip_ratio/low_mean': 0.0004574676713673398, 'clip_ratio/low_min': 0.00010720872160163708, 'clip_ratio/high_mean': 3.610977285006811e-05, 'clip_ratio/high_max': 8.7958029780566e-05, 'clip_ratio/region_mean': 0.0004935774386467529, 'epoch': 0.12}
+
+ 26%|██▌       | 262/1024 [19:34:32<63:19:48, 299.20s/it][AINFO 12-02 21:16:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:16:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 263/1024 [19:39:12<62:03:49, 293.60s/it][A
+                                                         [A{'loss': 0.0021, 'grad_norm': 0.0005074681248515844, 'learning_rate': 1e-05, 'num_tokens': 94659043.0, 'completions/mean_length': 6469.03125, 'completions/min_length': 344.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5808.03369140625, 'completions/min_terminated_length': 344.0, 'completions/max_terminated_length': 15423.0, 'rewards/accuracy_reward/mean': 0.8125, 'rewards/accuracy_reward/std': 0.39339789748191833, 'reward': 0.8125, 'reward_std': 0.22461533546447754, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01675971783697605, 'sampling/sampling_logp_difference/max': 14.171942710876465, 'sampling/importance_sampling_ratio/min': 7.001700055297988e-07, 'sampling/importance_sampling_ratio/mean': 1.000086784362793, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4062219373881817, 'clip_ratio/low_mean': 0.00016847672441144823, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.898953056908795e-05, 'clip_ratio/high_max': 0.0001519577781436965, 'clip_ratio/region_mean': 0.00021746625225205207, 'epoch': 0.12}
+
+ 26%|██▌       | 263/1024 [19:39:12<62:03:49, 293.60s/it][AINFO 12-02 21:21:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:21:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:21:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:21:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 264/1024 [19:42:05<54:20:22, 257.40s/it][A
+                                                         [A{'loss': -0.0894, 'grad_norm': 0.0017048402223736048, 'learning_rate': 1e-05, 'num_tokens': 94959356.0, 'completions/mean_length': 4474.015625, 'completions/min_length': 981.0, 'completions/max_length': 11015.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4474.015625, 'completions/min_terminated_length': 981.0, 'completions/max_terminated_length': 11015.0, 'rewards/accuracy_reward/mean': 0.875, 'rewards/accuracy_reward/std': 0.3333333432674408, 'reward': 0.875, 'reward_std': 0.1825428307056427, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01564747840166092, 'sampling/sampling_logp_difference/max': 1.5912723541259766, 'sampling/importance_sampling_ratio/min': 0.2036662995815277, 'sampling/importance_sampling_ratio/mean': 0.9999028444290161, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4505701772868633, 'clip_ratio/low_mean': 1.6859663446666673e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.354214297563885e-05, 'clip_ratio/high_max': 0.0001741685719025554, 'clip_ratio/region_mean': 6.0401806422305526e-05, 'epoch': 0.12}
+
+ 26%|██▌       | 264/1024 [19:42:05<54:20:22, 257.40s/it][AINFO 12-02 21:24:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:24:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:24:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:24:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 265/1024 [19:46:35<55:02:49, 261.09s/it][A
+                                                         [A{'loss': -0.0758, 'grad_norm': 0.001498171011917293, 'learning_rate': 1e-05, 'num_tokens': 95303228.0, 'completions/mean_length': 5231.375, 'completions/min_length': 537.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 4682.884765625, 'completions/min_terminated_length': 537.0, 'completions/max_terminated_length': 14001.0, 'rewards/accuracy_reward/mean': 0.734375, 'rewards/accuracy_reward/std': 0.44515693187713623, 'reward': 0.734375, 'reward_std': 0.25726157426834106, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01918056607246399, 'sampling/sampling_logp_difference/max': 1.8484957218170166, 'sampling/importance_sampling_ratio/min': 0.31582701206207275, 'sampling/importance_sampling_ratio/mean': 0.999983549118042, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.49470194056630135, 'clip_ratio/low_mean': 4.756956059281947e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.8644840489942e-05, 'clip_ratio/high_max': 0.0001360088235742296, 'clip_ratio/region_mean': 9.621439858165104e-05, 'epoch': 0.12}
+
+ 26%|██▌       | 265/1024 [19:46:35<55:02:49, 261.09s/it][AINFO 12-02 21:28:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:28:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 266/1024 [19:50:58<55:03:31, 261.49s/it][A
+                                                         [A{'loss': 0.0326, 'grad_norm': 0.004479626193642616, 'learning_rate': 1e-05, 'num_tokens': 95650277.0, 'completions/mean_length': 5280.140625, 'completions/min_length': 1043.0, 'completions/max_length': 15359.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5280.140625, 'completions/min_terminated_length': 1043.0, 'completions/max_terminated_length': 15359.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.4787135720252991, 'reward': 0.65625, 'reward_std': 0.3377464711666107, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.015082387253642082, 'sampling/sampling_logp_difference/max': 1.8780837059020996, 'sampling/importance_sampling_ratio/min': 0.15288279950618744, 'sampling/importance_sampling_ratio/mean': 0.9998706579208374, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.3957788422703743, 'clip_ratio/low_mean': 0.0001895098680506635, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.212961625351454e-05, 'clip_ratio/high_max': 0.00018428559178573778, 'clip_ratio/region_mean': 0.00025163948703266215, 'epoch': 0.12}
+
+ 26%|██▌       | 266/1024 [19:50:58<55:03:31, 261.49s/it][AINFO 12-02 21:33:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:33:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 267/1024 [19:56:07<58:02:12, 276.00s/it][A
+                                                         [A{'loss': -0.0467, 'grad_norm': 0.003040217561647296, 'learning_rate': 1e-05, 'num_tokens': 96080034.0, 'completions/mean_length': 6573.328125, 'completions/min_length': 1279.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6256.8544921875, 'completions/min_terminated_length': 1279.0, 'completions/max_terminated_length': 16109.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.49501484632492065, 'reward': 0.59375, 'reward_std': 0.40139204263687134, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.018247518688440323, 'sampling/sampling_logp_difference/max': 7.132770538330078, 'sampling/importance_sampling_ratio/min': 0.000798504042904824, 'sampling/importance_sampling_ratio/mean': 1.0000135898590088, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.5436258502304554, 'clip_ratio/low_mean': 0.00027653996949084103, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.575452036689967e-05, 'clip_ratio/high_max': 0.00023960355247254483, 'clip_ratio/region_mean': 0.0003522944825817831, 'epoch': 0.12}
+
+ 26%|██▌       | 267/1024 [19:56:07<58:02:12, 276.00s/it][AINFO 12-02 21:38:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:38:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:38:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:38:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 268/1024 [20:01:16<60:02:18, 285.90s/it][A
+                                                         [A{'loss': -0.069, 'grad_norm': 0.0008179154247045517, 'learning_rate': 1e-05, 'num_tokens': 96411562.0, 'completions/mean_length': 5036.75, 'completions/min_length': 181.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 3862.896484375, 'completions/min_terminated_length': 181.0, 'completions/max_terminated_length': 15049.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.5, 'reward': 0.5625, 'reward_std': 0.32195523381233215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.015764065086841583, 'sampling/sampling_logp_difference/max': 1.5857148170471191, 'sampling/importance_sampling_ratio/min': 0.20480135083198547, 'sampling/importance_sampling_ratio/mean': 0.9999369382858276, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4645473510026932, 'clip_ratio/low_mean': 4.7488576456089504e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4606441911600996e-05, 'clip_ratio/high_max': 9.842576764640398e-05, 'clip_ratio/region_mean': 7.209501882243785e-05, 'epoch': 0.12}
+
+ 26%|██▌       | 268/1024 [20:01:16<60:02:18, 285.90s/it][AINFO 12-02 21:43:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:43:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:43:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:43:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 269/1024 [20:05:37<58:20:59, 278.22s/it][A
+                                                         [A{'loss': 0.0798, 'grad_norm': 0.001968186115846038, 'learning_rate': 1e-05, 'num_tokens': 96739134.0, 'completions/mean_length': 4951.4375, 'completions/min_length': 617.0, 'completions/max_length': 16190.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4951.4375, 'completions/min_terminated_length': 617.0, 'completions/max_terminated_length': 16190.0, 'rewards/accuracy_reward/mean': 0.71875, 'rewards/accuracy_reward/std': 0.4531635046005249, 'reward': 0.71875, 'reward_std': 0.2756393849849701, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015898514539003372, 'sampling/sampling_logp_difference/max': 1.2493343353271484, 'sampling/importance_sampling_ratio/min': 0.3041094243526459, 'sampling/importance_sampling_ratio/mean': 1.000047206878662, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.42127126455307007, 'clip_ratio/low_mean': 8.5288150330598e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.41143734835714e-05, 'clip_ratio/high_max': 0.00016850479323693435, 'clip_ratio/region_mean': 0.00013940252392785624, 'epoch': 0.12}
+
+ 26%|██▋       | 269/1024 [20:05:37<58:20:59, 278.22s/it][AINFO 12-02 21:47:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:47:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:47:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:47:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 270/1024 [20:09:52<56:51:18, 271.46s/it][A
+                                                         [A{'loss': 0.0467, 'grad_norm': 0.002243818948045373, 'learning_rate': 1e-05, 'num_tokens': 97107255.0, 'completions/mean_length': 5599.515625, 'completions/min_length': 619.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5251.62890625, 'completions/min_terminated_length': 619.0, 'completions/max_terminated_length': 13966.0, 'rewards/accuracy_reward/mean': 0.84375, 'rewards/accuracy_reward/std': 0.36596253514289856, 'reward': 0.84375, 'reward_std': 0.2756393849849701, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01510545052587986, 'sampling/sampling_logp_difference/max': 2.7081260681152344, 'sampling/importance_sampling_ratio/min': 0.06666161119937897, 'sampling/importance_sampling_ratio/mean': 1.0000274181365967, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.38387196511030197, 'clip_ratio/low_mean': 0.00011216796872304258, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6863826633416465e-05, 'clip_ratio/high_max': 9.99491503534955e-05, 'clip_ratio/region_mean': 0.0001390317952427722, 'epoch': 0.12}
+
+ 26%|██▋       | 270/1024 [20:09:52<56:51:18, 271.46s/it][AINFO 12-02 21:51:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:51:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:51:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:51:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▋       | 271/1024 [20:12:34<49:54:13, 238.58s/it][A
+                                                         [A{'loss': -0.1001, 'grad_norm': 0.0018381833797320724, 'learning_rate': 1e-05, 'num_tokens': 97315332.0, 'completions/mean_length': 3127.578125, 'completions/min_length': 381.0, 'completions/max_length': 10252.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3127.578125, 'completions/min_terminated_length': 381.0, 'completions/max_terminated_length': 10252.0, 'rewards/accuracy_reward/mean': 0.75, 'rewards/accuracy_reward/std': 0.4364357888698578, 'reward': 0.75, 'reward_std': 0.2177756428718567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01425042375922203, 'sampling/sampling_logp_difference/max': 1.1817936897277832, 'sampling/importance_sampling_ratio/min': 0.3067280650138855, 'sampling/importance_sampling_ratio/mean': 1.000335693359375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.38662879914045334, 'clip_ratio/low_mean': 7.85356605774723e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.777219783136388e-05, 'clip_ratio/high_max': 0.00016277498434646986, 'clip_ratio/region_mean': 0.00012630785840883618, 'epoch': 0.12}
+
+ 26%|██▋       | 271/1024 [20:12:34<49:54:13, 238.58s/it][AINFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:54:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 272/1024 [20:17:08<52:03:57, 249.25s/it][A
+                                                         [A{'loss': 0.0703, 'grad_norm': 0.0011525214649736881, 'learning_rate': 1e-05, 'num_tokens': 97647506.0, 'completions/mean_length': 5032.96875, 'completions/min_length': 684.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4852.7939453125, 'completions/min_terminated_length': 684.0, 'completions/max_terminated_length': 16146.0, 'rewards/accuracy_reward/mean': 0.78125, 'rewards/accuracy_reward/std': 0.4166666865348816, 'reward': 0.78125, 'reward_std': 0.10888782143592834, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.016339745372533798, 'sampling/sampling_logp_difference/max': 3.820802688598633, 'sampling/importance_sampling_ratio/min': 0.021910205483436584, 'sampling/importance_sampling_ratio/mean': 0.999967098236084, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.45733584463596344, 'clip_ratio/low_mean': 5.219016566115897e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.219016566115897e-05, 'epoch': 0.13}
+
+ 27%|██▋       | 272/1024 [20:17:08<52:03:57, 249.25s/it][AINFO 12-02 21:59:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:59:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:59:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 21:59:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 273/1024 [20:21:00<50:53:24, 243.95s/it][A
+                                                         [A{'loss': -0.0155, 'grad_norm': 0.0035352492704987526, 'learning_rate': 1e-05, 'num_tokens': 97930161.0, 'completions/mean_length': 4199.484375, 'completions/min_length': 826.0, 'completions/max_length': 16198.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4199.484375, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 16198.0, 'rewards/accuracy_reward/mean': 0.859375, 'rewards/accuracy_reward/std': 0.3503824472427368, 'reward': 0.859375, 'reward_std': 0.19044627249240875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.015218119136989117, 'sampling/sampling_logp_difference/max': 1.6911556720733643, 'sampling/importance_sampling_ratio/min': 0.18430639803409576, 'sampling/importance_sampling_ratio/mean': 0.9999779462814331, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.4148215837776661, 'clip_ratio/low_mean': 6.990427959863155e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7000283239431155e-05, 'clip_ratio/high_max': 0.00010800113295772462, 'clip_ratio/region_mean': 9.690456249700219e-05, 'epoch': 0.13}
+
+ 27%|██▋       | 273/1024 [20:21:00<50:53:24, 243.95s/it][AINFO 12-02 22:03:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:03:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:03:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 22:03:06 [block_pool.py:292] Successfully reset prefix cache
diff --git a/grpo_lora_20251130_192918/README.md b/grpo_lora_20251130_192918/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1644baed519d6bcd6455dd03ece9b31c30ec0d5d
--- /dev/null
+++ b/grpo_lora_20251130_192918/README.md
@@ -0,0 +1,72 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+model_name: dr_grpo_lora_20251130_192918
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+
+# Model Card for dr_grpo_lora_20251130_192918
+
+This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="grpo_lora_20251130_192918", device="cuda")  # local path to this adapter directory
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8qozoeij) 
+
+
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
+### Framework versions
+
+- PEFT: 0.17.1
+- TRL: 0.25.0
+- Transformers: 4.57.1
+- Pytorch: 2.8.0
+- Datasets: 4.4.1
+- Tokenizers: 0.22.1
+
+## Citations
+
+Cite GRPO as:
+
+```bibtex
+@article{shao2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+
+```
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/adapter_config.json b/grpo_lora_20251130_192918/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..da884a8eb3c02a42d08fe869da98a8ad4366197d
--- /dev/null
+++ b/grpo_lora_20251130_192918/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/chat_template.jinja b/grpo_lora_20251130_192918/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/grpo_lora_20251130_192918/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/checkpoint-1024/adapter_config.json b/grpo_lora_20251130_192918/checkpoint-1024/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..da884a8eb3c02a42d08fe869da98a8ad4366197d
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-1024/adapter_config.json
@@ -0,0 +1,42 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/checkpoint-1024/chat_template.jinja b/grpo_lora_20251130_192918/checkpoint-1024/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-1024/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/checkpoint-1024/latest b/grpo_lora_20251130_192918/checkpoint-1024/latest
new file mode 100644
index 0000000000000000000000000000000000000000..97fe0c3f1bf7645f1b3a8c4e0727a37322abbea2
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-1024/latest
@@ -0,0 +1 @@
+global_step1024
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/checkpoint-1024/special_tokens_map.json b/grpo_lora_20251130_192918/checkpoint-1024/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-1024/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/grpo_lora_20251130_192918/checkpoint-1024/tokenizer_config.json b/grpo_lora_20251130_192918/checkpoint-1024/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-1024/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
diff --git a/grpo_lora_20251130_192918/checkpoint-1024/zero_to_fp32.py b/grpo_lora_20251130_192918/checkpoint-1024/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-1024/zero_to_fp32.py
@@ -0,0 +1,760 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:  # one record per file loaded by parse_model_states()
+    buffers: dict()  # module entries whose names are listed under BUFFER_NAMES, cast with .float()
+    param_shapes: dict()  # state_dict[PARAM_SHAPES]; iterated as a sequence of name-keyed mappings
+    shared_params: list  # [key, value] pairs taken from state_dict["shared_params"]
+    ds_version: int  # state_dict.get(DS_VERSION, None); None when the key is absent
+    frozen_param_shapes: dict()  # state_dict.get(FROZEN_PARAM_SHAPES, None); None when absent
+    frozen_param_fragments: dict()  # state_dict.get(FROZEN_PARAM_FRAGMENTS, None); None when absent
+
+
+debug = 0  # set to a truthy value to enable the extra prints guarded by `if debug:`
+
+# load to cpu
+device = torch.device('cpu')  # used as map_location for the torch.load() calls below
+
+
+def atoi(text):  # helper for natural_keys(): digit-only chunks become ints, anything else is returned as-is
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    Sort key for human ordering: alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]  # the capture group keeps digit runs, so "x10" sorts after "x2"
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """Return the model states file for ``zero_stage`` (FileNotFoundError if absent, ValueError if stage unknown)."""
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:  # without this branch `file` stayed unbound and the check below raised UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):  # returns the matching paths sorted with natural_keys
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:  # an empty match is reported as an error instead of returning []
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):  # "*_optim_states.pt" files in checkpoint_dir; raises FileNotFoundError if none
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):  # "*_model_states.pt" files in checkpoint_dir; raises FileNotFoundError if none
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):  # returns one zero_model_state per entry of `files`, in the same order
+    zero_model_states = []
+    for file in files:  # weights_only=False below runs the full unpickler: only load trusted checkpoints
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []  # NOTE(review): filled in below but never read again inside this function
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]  # KeyError if the key is missing
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):  # returns (zero_stage, world_size, fp32_flat_groups)
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):  # weights_only=False unpickles fully: trusted files only
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:  # stage and partition count come from the first file only
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:  # one optimizer-state file is expected per partition
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups  # fp32_flat_groups has one entry per file, in `files` order
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded to the stage specific helper; when true the
+          frozen parameters are not merged into the result
+
+    Returns:
+        - the state_dict produced by the zero2 or zero3 reconstruction helper
+
+    Raises:
+        - ``ValueError``: if the detected zero stage is handled by neither helper
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+    # never fall through and silently hand ``None`` to the caller
+    raise ValueError(f"unknown zero stage {zero_stage}")
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded by rank 0 into ``state_dict``.
+
+    Only ``zero_model_states[0]`` is read: every name listed in its ``frozen_param_shapes`` is
+    mapped to the matching entry of its ``frozen_param_fragments``. Nothing happens when there are
+    no frozen parameter shapes.
+
+    Args:
+        - ``state_dict``: dict that is updated in place
+        - ``zero_model_states``: parsed model states; only the first entry is used
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+
+    shapes_by_name = rank0_state.frozen_param_shapes
+    fragments_by_name = rank0_state.frozen_param_fragments
+
+    if debug:
+        wanted_numel = sum(shape.numel() for shape in shapes_by_name.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {wanted_numel}')
+
+        wanted_params = len(shapes_by_name)
+        avail_numel = sum(fragment.numel() for fragment in fragments_by_name.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_numel = 0
+    for name, shape in shapes_by_name.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = fragments_by_name[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    total_params = len(shapes_by_name)
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a zero stage <= 2 checkpoint into ``state_dict``.
+
+    Args:
+        - ``state_dict``: dict that is updated in place with ``name -> tensor`` entries
+        - ``world_size``: number of partitions; used for the debug output and to align the sanity check
+        - ``fp32_flat_groups``: indexed as ``[rank][param_group]``, each entry a flat fp32 tensor
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first entry is read
+
+    Raises:
+        - ``ValueError``: if, for any param group, the aligned number of consumed elements differs
+          from the aligned number of available elements
+    """
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # 1. for every param group concatenate the partitions of all ranks into one flat fp32 vector
+    # 2. walk the shapes recorded for that group in order and carve each param out of the vector
+    #    at a running offset
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # sd is the list of flat groups of one rank; sd[i] is that rank's partition of group i
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups; only reported in debug mode (the loop below rebinds it per group)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts for every param group since each group has its own flat vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shapes without a callable ``numel`` fall back to the product of their entries
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() + view() return views, so the param shares storage with the merged vector
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a zero stage <= 2 checkpoint.
+
+    The result is filled in this order: buffers of the first model state, frozen parameters
+    (unless ``exclude_frozen_parameters`` is set), trainable parameters, and finally the shared
+    parameters, which are added as aliases of entries that are already present.
+
+    Returns:
+        - an ``OrderedDict`` mapping names to tensors
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: pair[0] becomes an alias of pair[1] when the latter was rebuilt
+    for pair in rank0_state.shared_params:
+        source_name = pair[1]
+        if source_name in state_dict:
+            state_dict[pair[0]] = state_dict[source_name]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Compute how a param of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.
+
+    Returns:
+        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the
+          number of elements needed to make the total a multiple of ``world_size``
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a zero stage 3 checkpoint into ``state_dict``.
+
+    For every name in the ``frozen_param_shapes`` of the first model state, the fragments of all
+    model states are concatenated, cut to the unpartitioned size and viewed in the recorded
+    shape. Nothing happens when there are no frozen parameter shapes.
+
+    Args:
+        - ``state_dict``: dict that is updated in place
+        - ``world_size``: number of partitions; used for the debug output and partition info
+        - ``zero_model_states``: parsed model states, one per rank
+    """
+    rank0_state = zero_model_states[0]
+    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for rank in range(world_size):
+            rank_fragments = zero_model_states[rank].frozen_param_fragments
+            num_elem = sum(fragment.numel() for fragment in rank_fragments.values())
+            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+        shapes_by_name = rank0_state.frozen_param_shapes
+        wanted_params = len(shapes_by_name)
+        wanted_numel = sum(shape.numel() for shape in shapes_by_name.values())
+        rank0_numel = sum(fragment.numel() for fragment in rank0_state.frozen_param_fragments.values())
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(rank0_state.frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # one fragment per rank; the concatenation may end in padding, which narrow() cuts off
+        per_rank_fragments = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        merged = torch.cat(per_rank_fragments, 0)
+        state_dict[name] = merged.narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+
+    The object only records where the parameter lives inside the per-rank flat groups; the data
+    is gathered when ``contiguous()`` is called.
+
+    Args:
+        - ``flat_groups``: ``flat_groups[rank][group]`` is a flat tensor
+        - ``flat_groups_offset``: cumulative element offsets of the groups of one rank; starts with
+          0 and has one more entry than there are groups
+        - ``offset``: start of this parameter's partition, counted over the concatenated groups of a rank
+        - ``partitioned_numel``: number of elements every rank holds for this parameter
+        - ``shape``: full shape of the parameter; must provide ``numel()``
+    """
+
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype of the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+
+        Returns:
+            - a contiguous tensor of shape ``self.shape``; gathered elements beyond
+              ``self.shape.numel()`` (padding) are dropped
+
+        Raises:
+            - ``ValueError``: if the partition is not covered by ``flat_groups_offset``
+        """
+        if self.partitioned_numel == 0:
+            # a zero-element parameter owns no slice of the flat groups, nothing to gather
+            return self.flat_groups[0][0].new_empty(0).view(self.shape)
+
+        end_idx = self.offset + self.partitioned_numel
+        offsets = self.flat_groups_offset
+
+        # the groups touched by the partition depend only on the offsets, not on the rank,
+        # so they are located once instead of once per rank
+        start_group_id = None
+        end_group_id = None
+        for group_id in range(len(offsets) - 1):
+            if offsets[group_id] <= self.offset < offsets[group_id + 1]:
+                start_group_id = group_id
+            if offsets[group_id] < end_idx <= offsets[group_id + 1]:
+                end_group_id = group_id
+                break
+        if start_group_id is None or end_group_id is None:
+            raise ValueError(f"partition [{self.offset}, {end_idx}) is not covered by the flat groups")
+
+        pad_flat_param_chunks = []
+        for flat_groups_at_rank_i in self.flat_groups:
+            # for each rank, collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                group_start = offsets[group_id]
+                # clamp to the group boundaries: only the first group is entered mid-way, every
+                # following group is read from its beginning (an unclamped start would go negative)
+                start_offset = max(self.offset, group_start) - group_start
+                end_offset = min(end_idx, offsets[group_id + 1]) - group_start
+                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][start_offset:end_offset])
+
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register the trainable parameters of a zero stage 3 checkpoint in ``state_dict``.
+
+    Every parameter is stored as a ``GatheredTensor`` that remembers its position inside the
+    per-rank flat groups; no data is copied here.
+
+    Args:
+        - ``state_dict``: dict that is updated in place with ``name -> GatheredTensor`` entries
+        - ``world_size``: number of partitions
+        - ``fp32_flat_groups``: indexed as ``[rank][group]``, each entry a flat fp32 tensor
+        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first entry is read
+
+    Raises:
+        - ``ValueError``: if the consumed element count differs from the available one
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # all ranks are assumed to hold as many elements as rank 0
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        # every rank holds a list of flat groups, so the shapes are reported per group
+        for i in range(world_size):
+            for j, flat_group in enumerate(fp32_flat_groups[i]):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
+
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative start offset of every group of a rank, plus the total as last entry
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+
+    # offset counted the elements of one rank; scale it to compare against all ranks
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state_dict of a zero stage 3 checkpoint.
+
+    The result is filled in this order: buffers of the first model state, frozen parameters
+    (unless ``exclude_frozen_parameters`` is set), trainable parameters, and finally the shared
+    parameters, which are added as aliases of entries that are already present.
+
+    Returns:
+        - an ``OrderedDict`` mapping names to the reconstructed entries
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = rank0_state.buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters: pair[0] becomes an alias of pair[1] when the latter was rebuilt
+    for pair in rank0_state.shared_params:
+        source_name = pair[1]
+        if source_name in state_dict:
+            state_dict[pair[0]] = state_dict[source_name]
+
+    return state_dict
+
+
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+
+    Entries that are the very same object (shared tensors) are converted once and the converted
+    tensor is reused for every name. With ``return_empty_tensor`` set, an uninitialized tensor of
+    the same shape and dtype is created instead of calling ``contiguous()``.
+    """
+    converted = {}
+    first_name_by_id = {}
+    for name, entry in state_dict.items():
+        entry_id = id(entry)
+        if entry_id in first_name_by_id:
+            # shared tensors: reuse what was produced for the first name
+            converted[name] = converted[first_name_by_id[entry_id]]
+            continue
+
+        first_name_by_id[entry_id] = name
+        converted[name] = (torch.empty(entry.shape, dtype=entry.dtype)
+                           if return_empty_tensor else entry.contiguous())
+    return converted
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Raises:
+        - ``ValueError``: if ``tag`` is not given and there is no 'latest' file in ``checkpoint_dir``
+        - ``FileNotFoundError``: if the tag folder does not exist inside ``checkpoint_dir``
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        # fall back to the tag stored in the 'latest' file
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    # lazy mode hands out the reconstructed entries untouched, otherwise they are materialized
+    return state_dict if lazy_mode else to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          Pass ``None`` to write everything into a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Raises:
+        - ``ImportError``: re-raised when ``safetensors`` (for ``safe_serialization``) or
+          ``huggingface_hub`` (for ``max_shard_size``) cannot be imported
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    # lazy mode: entries are only materialized shard by shard further down
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # only one of the two replace() calls matches, depending on the extension of weights_name
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # the shard layout is planned on empty tensors of the right shape/dtype, not on real data
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # minimal stand-in exposing the two attributes used below: one file holding every tensor
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        # materialize only the tensors of the current shard
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        # list() takes a snapshot of the keys because the dict is emptied while looping
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    2. Put the provided model to cpu
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    # strict=False: keys that do not match between the model and the state_dict are not an error
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+    # Command line entry point: convert the checkpoint found in ``checkpoint_dir`` into fp32
+    # weight file(s) written to ``output_dir``.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    # adjacent string literals are concatenated as-is, so every segment ends with a space
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    # rebinds the module level ``debug`` flag that the merge helpers read
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/grpo_lora_20251130_192918/checkpoint-128/README.md b/grpo_lora_20251130_192918/checkpoint-128/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-128/README.md
@@ -0,0 +1,209 @@
+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/checkpoint-128/chat_template.jinja b/grpo_lora_20251130_192918/checkpoint-128/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-128/chat_template.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\n'}}{% endif %}
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/checkpoint-128/latest b/grpo_lora_20251130_192918/checkpoint-128/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b4db7fb020d9ef75e52048bf0cde7481e3ef9351
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-128/latest
@@ -0,0 +1 @@
+global_step128
\ No newline at end of file
diff --git a/grpo_lora_20251130_192918/checkpoint-128/special_tokens_map.json b/grpo_lora_20251130_192918/checkpoint-128/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/grpo_lora_20251130_192918/checkpoint-128/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/grpo_lora_20251130_192918/output.log b/grpo_lora_20251130_192918/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..41e87109158b6707615cd680febbbbdcaad4da05
--- /dev/null
+++ b/grpo_lora_20251130_192918/output.log
@@ -0,0 +1,13922 @@
+W1130 19:29:41.689000 398113 torch/distributed/run.py:774] 
+W1130 19:29:41.689000 398113 torch/distributed/run.py:774] *****************************************
+W1130 19:29:41.689000 398113 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W1130 19:29:41.689000 398113 torch/distributed/run.py:774] *****************************************
+INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda.
+INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda.
+INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda.
+INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda.
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, 
save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+
+[OpenTinker] 2025-11-30 19:30:09,846 - root - INFO - Output directory outputs/dr_grpo_lora_20251130_192918 already exists, using it
+[OpenTinker] 2025-11-30 19:30:09,846 - root - INFO - Output directory outputs/dr_grpo_lora_20251130_192918 already exists, using it
+[OpenTinker] 2025-11-30 19:30:09,846 - root - INFO - Output directory outputs/dr_grpo_lora_20251130_192918 already exists, using it
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))
+[OpenTinker] 2025-11-30 19:30:09,849 - root - INFO - Output directory outputs/dr_grpo_lora_20251130_192918 already exists, using it
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: setting up run 8qozoeij
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-8qozoeij
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dr_grpo_lora_20251130_192918
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8qozoeij
+wandb: setting up run hblruoay
+wandb: setting up run axfzdypj
+wandb: setting up run 56oyy2tp
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-11-30 19:30:16,189 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-11-30 19:30:16,189 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-hblruoay
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dr_grpo_lora_20251130_192918
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/hblruoay
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-56oyy2tp
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dr_grpo_lora_20251130_192918
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/56oyy2tp
+wandb: Tracking run with wandb version 0.22.3
+wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-axfzdypj
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run outputs/dr_grpo_lora_20251130_192918
+wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina
+wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/axfzdypj
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-11-30 19:30:16,546 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-11-30 19:30:16,546 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-11-30 19:30:16,592 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-11-30 19:30:16,592 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+wandb: Detected [huggingface_hub.inference, openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[OpenTinker] 2025-11-30 19:30:16,632 - root - INFO - Wandb initialized successfully
+[OpenTinker] 2025-11-30 19:30:16,632 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-11-30 19:30:17,510 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-11-30 19:30:17,951 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-11-30 19:30:17,981 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-11-30 19:30:17,984 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed
+[OpenTinker] 2025-11-30 19:30:21,260 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-11-30 19:30:21,412 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+[OpenTinker] 2025-11-30 19:30:21,466 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+`torch_dtype` is deprecated! Use `dtype` instead!
+[OpenTinker] 2025-11-30 19:30:21,785 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+[OpenTinker] 2025-11-30 19:30:22,855 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-11-30 19:30:22,856 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-11-30 19:30:22,883 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-11-30 19:30:22,884 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-11-30 19:30:22,893 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-11-30 19:30:22,894 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-11-30 19:30:23,079 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-11-30 19:30:23,111 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-11-30 19:30:23,112 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-11-30 19:30:23,177 - root - INFO - Model loaded successfully
+[OpenTinker] 2025-11-30 19:30:23,177 - root - INFO - Detected PEFT configuration, configuring lora
+[OpenTinker] 2025-11-30 19:30:23,403 - root - INFO - Lora configured successfully
+[OpenTinker] 2025-11-30 19:30:23,567 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpjzmx7k6g/test.c -o /tmp/tmpjzmx7k6g/test.o
+[OpenTinker] 2025-11-30 19:30:23,567 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp5gptor5t/test.c -o /tmp/tmp5gptor5t/test.o
+[OpenTinker] 2025-11-30 19:30:23,567 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpx1zh486p/test.c -o /tmp/tmpx1zh486p/test.o
+[OpenTinker] 2025-11-30 19:30:23,594 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpjzmx7k6g/test.o -laio -o /tmp/tmpjzmx7k6g/a.out
+[OpenTinker] 2025-11-30 19:30:23,605 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp5gptor5t/test.o -laio -o /tmp/tmp5gptor5t/a.out
+[OpenTinker] 2025-11-30 19:30:23,605 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpx1zh486p/test.o -laio -o /tmp/tmpx1zh486p/a.out
+[OpenTinker] 2025-11-30 19:30:23,743 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpbwojjnr2/test.c -o /tmp/tmpbwojjnr2/test.o
+[OpenTinker] 2025-11-30 19:30:23,771 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpbwojjnr2/test.o -laio -o /tmp/tmpbwojjnr2/a.out
+[OpenTinker] 2025-11-30 19:30:24,099 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp89odbgc6/test.c -o /tmp/tmp89odbgc6/test.o
+[OpenTinker] 2025-11-30 19:30:24,118 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpq77yq7ci/test.c -o /tmp/tmpq77yq7ci/test.o
+[OpenTinker] 2025-11-30 19:30:24,137 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpfom6ln06/test.c -o /tmp/tmpfom6ln06/test.o
+[OpenTinker] 2025-11-30 19:30:24,149 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp89odbgc6/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp89odbgc6/a.out
+[OpenTinker] 2025-11-30 19:30:24,162 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpq77yq7ci/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpq77yq7ci/a.out
+[OpenTinker] 2025-11-30 19:30:24,175 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpfom6ln06/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpfom6ln06/a.out
+[OpenTinker] 2025-11-30 19:30:24,333 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmph6jzh41r/test.c -o /tmp/tmph6jzh41r/test.o
+[OpenTinker] 2025-11-30 19:30:24,359 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmph6jzh41r/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmph6jzh41r/a.out
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO cudaDriverVersion 12090
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO cudaDriverVersion 12090
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO cudaDriverVersion 12090
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO cudaDriverVersion 12090
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO NCCL version 2.27.3+cuda12.9
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Initialized NET plugin Socket
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Initialized NET plugin Socket
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Initialized NET plugin Socket
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0>
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Initialized NET plugin Socket
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO ncclCommInitRankConfig comm 0x19032270 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x5f2965acf2c07fe9 - Init START
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO ncclCommInitRankConfig comm 0x191e30a0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0x5f2965acf2c07fe9 - Init START
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO ncclCommInitRankConfig comm 0x1a5a5680 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x5f2965acf2c07fe9 - Init START
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO ncclCommInitRankConfig comm 0x18ac50d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x5f2965acf2c07fe9 - Init START
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO RAS client listening socket at ::1<28028>
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Bootstrap timings total 0.000784 (create 0.000019, send 0.000101, recv 0.000107, ring 0.000172, delay 0.000001)
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Bootstrap timings total 0.051831 (create 0.000022, send 0.000091, recv 0.018422, ring 0.000055, delay 0.000001)
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Bootstrap timings total 0.002087 (create 0.000020, send 0.000764, recv 0.000689, ring 0.000199, delay 0.000001)
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Bootstrap timings total 0.033559 (create 0.000024, send 0.000095, recv 0.032242, ring 0.000593, delay 0.000001)
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0.
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO comm 0x191e30a0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO comm 0x19032270 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO comm 0x1a5a5680 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO comm 0x18ac50d0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-e9wz-2:398357:399452 [3] NCCL INFO [Proxy Service] Device 3 CPU core 26
+lshn-qs-e9wz-2:398357:399453 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 32
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-e9wz-2:398355:399454 [1] NCCL INFO [Proxy Service] Device 1 CPU core 120
+lshn-qs-e9wz-2:398355:399455 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 34
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-e9wz-2:398354:399456 [0] NCCL INFO [Proxy Service] Device 0 CPU core 45
+lshn-qs-e9wz-2:398354:399457 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 11
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-e9wz-2:398356:399458 [2] NCCL INFO [Proxy Service] Device 2 CPU core 111
+lshn-qs-e9wz-2:398356:399459 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 42
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO ncclCommInitRankConfig comm 0x191e30a0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0x5f2965acf2c07fe9 - Init COMPLETE
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.95 (kernels 0.18, alloc 0.61, bootstrap 0.03, allgathers 0.00, topo 0.05, graphs 0.01, connections 0.05, rest 0.03)
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO ncclCommInitRankConfig comm 0x19032270 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x5f2965acf2c07fe9 - Init COMPLETE
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.95 (kernels 0.17, alloc 0.59, bootstrap 0.05, allgathers 0.00, topo 0.05, graphs 0.01, connections 0.04, rest 0.03)
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO ncclCommInitRankConfig comm 0x18ac50d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x5f2965acf2c07fe9 - Init COMPLETE
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.89 (kernels 0.19, alloc 0.56, bootstrap 0.00, allgathers 0.01, topo 0.05, graphs 0.01, connections 0.04, rest 0.04)
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO ncclCommInitRankConfig comm 0x1a5a5680 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x5f2965acf2c07fe9 - Init COMPLETE
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.90 (kernels 0.16, alloc 0.59, bootstrap 0.00, allgathers 0.00, topo 0.05, graphs 0.01, connections 0.05, rest 0.03)
+[OpenTinker] 2025-11-30 19:30:28,208 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-11-30 19:30:28,212 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-11-30 19:30:28,227 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-11-30 19:30:28,237 - root - INFO - Training model with GRPO
+[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {}
+[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {}
+[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {}
+[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {}
+INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'}
+INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896
+INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896
+INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896
+INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM
+INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896
+INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
+INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
+INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null}
+[rank2]:[W1130 19:30:53.106048450 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+[rank1]:[W1130 19:30:53.127200306 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+[rank3]:[W1130 19:30:53.134440865 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+[rank0]:[W1130 19:30:53.175332231 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO ncclCommSplit comm 0x1b723300 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 1 color 2003953581 key 3- Init START
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO ncclCommSplit comm 0x1a22d200 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 1 color 2003953581 key 1- Init START
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO ncclCommSplit comm 0x1a2a9f20 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 1 color 2003953581 key 0- Init START
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO ncclCommSplit comm 0x1a4553e0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 1 color 2003953581 key 2- Init START
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO comm 0x1a4553e0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO comm 0x1a22d200 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO comm 0x1b723300 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO comm 0x1a2a9f20 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398356:399604 [2] NCCL INFO [Proxy Service] Device 2 CPU core 114
+lshn-qs-e9wz-2:398357:399603 [3] NCCL INFO [Proxy Service] Device 3 CPU core 103
+lshn-qs-e9wz-2:398356:399605 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 115
+lshn-qs-e9wz-2:398357:399606 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 9
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-e9wz-2:398354:399607 [0] NCCL INFO [Proxy Service] Device 0 CPU core 117
+lshn-qs-e9wz-2:398354:399608 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 22
+lshn-qs-e9wz-2:398355:399609 [1] NCCL INFO [Proxy Service] Device 1 CPU core 2
+lshn-qs-e9wz-2:398355:399610 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 25
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO ncclCommSplit comm 0x1a2a9f20 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 1 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO ncclCommSplit comm 0x1a4553e0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 1 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO ncclCommSplit comm 0x1a22d200 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 1 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO ncclCommSplit comm 0x1b723300 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 1 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.08)
+lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.11 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.05)
+lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.05)
+[Gloo] Rank 0 is connected to 3[Gloo] Rank [Gloo] Rank  peer ranks. Expected number of connected peer ranks is : [Gloo] Rank 31 is connected to 2 is connected to 3 is connected to 33 peer ranks. 3 peer ranks. 
+ peer ranks. Expected number of connected peer ranks is : Expected number of connected peer ranks is : 3Expected number of connected peer ranks is : 33
+
+
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO ncclCommSplit comm 0x1a3beb50 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 2 color 59908776 key 0- Init START
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO comm 0x1a3beb50 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398354:399633 [0] NCCL INFO [Proxy Service] Device 0 CPU core 5
+lshn-qs-e9wz-2:398354:399634 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 108
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO ncclCommSplit comm 0x1a3beb50 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 2 color 59908776 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.01)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO ncclCommSplit comm 0x1a3418c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 4 color 440515407 key 0- Init START
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO comm 0x1a3418c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398355:399648 [1] NCCL INFO [Proxy Service] Device 1 CPU core 110
+lshn-qs-e9wz-2:398355:399649 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 107
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO ncclCommSplit comm 0x1a3418c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 4 color 440515407 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO ncclCommSplit comm 0x1a55cff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 6 color 1227022723 key 0- Init START
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO comm 0x1a55cff0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398356:399663 [2] NCCL INFO [Proxy Service] Device 2 CPU core 24
+lshn-qs-e9wz-2:398356:399664 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 36
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO ncclCommSplit comm 0x1a55cff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 6 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.05 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO ncclCommSplit comm 0x1b837da0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 8 color 1301067556 key 0- Init START
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO comm 0x1b837da0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398357:399680 [3] NCCL INFO [Proxy Service] Device 3 CPU core 113
+lshn-qs-e9wz-2:398357:399681 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 41
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO ncclCommSplit comm 0x1b837da0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 8 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.06, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO ncclCommSplit comm 0x1bb1b890 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 9 color 59908776 key 0- Init START
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO comm 0x1bb1b890 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398354:399689 [0] NCCL INFO [Proxy Service] Device 0 CPU core 98
+lshn-qs-e9wz-2:398354:399690 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 104
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO ncclCommSplit comm 0x1bb1b890 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 9 color 59908776 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.12 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.09)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO ncclCommSplit comm 0x1ba8b5c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 11 color 440515407 key 0- Init START
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO comm 0x1ba8b5c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398355:399704 [1] NCCL INFO [Proxy Service] Device 1 CPU core 16
+lshn-qs-e9wz-2:398355:399705 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 115
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO ncclCommSplit comm 0x1ba8b5c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 11 color 440515407 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO ncclCommSplit comm 0x1bc4c240 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 13 color 1227022723 key 0- Init START
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO comm 0x1bc4c240 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398356:399719 [2] NCCL INFO [Proxy Service] Device 2 CPU core 40
+lshn-qs-e9wz-2:398356:399720 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 121
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO ncclCommSplit comm 0x1bc4c240 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 13 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO ncclCommSplit comm 0x1cf940f0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 15 color 1301067556 key 0- Init START
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO comm 0x1cf940f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398357:399736 [3] NCCL INFO [Proxy Service] Device 3 CPU core 143
+lshn-qs-e9wz-2:398357:399737 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 9
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO ncclCommSplit comm 0x1cf940f0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 15 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO ncclCommSplit comm 0x1bc234a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 16 color 59908776 key 0- Init START
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO comm 0x1bc234a0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398354:399745 [0] NCCL INFO [Proxy Service] Device 0 CPU core 13
+lshn-qs-e9wz-2:398354:399746 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 15
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO ncclCommSplit comm 0x1bc234a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 16 color 59908776 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO ncclCommSplit comm 0x1bb931d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 18 color 440515407 key 0- Init START
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO comm 0x1bb931d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398355:399760 [1] NCCL INFO [Proxy Service] Device 1 CPU core 139
+lshn-qs-e9wz-2:398355:399761 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 113
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO ncclCommSplit comm 0x1bb931d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 18 color 440515407 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO ncclCommSplit comm 0x1bd53e50 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 20 color 1227022723 key 0- Init START
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO comm 0x1bd53e50 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398356:399775 [2] NCCL INFO [Proxy Service] Device 2 CPU core 0
+lshn-qs-e9wz-2:398356:399776 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 114
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO ncclCommSplit comm 0x1bd53e50 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 20 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO ncclCommSplit comm 0x1d09bd00 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 22 color 1301067556 key 0- Init START
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO comm 0x1d09bd00 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398357:399792 [3] NCCL INFO [Proxy Service] Device 3 CPU core 101
+lshn-qs-e9wz-2:398357:399793 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 103
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO ncclCommSplit comm 0x1d09bd00 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 22 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO ncclCommSplit comm 0x1bd2b0b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 23 color 59908776 key 0- Init START
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO comm 0x1bd2b0b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398354:399801 [0] NCCL INFO [Proxy Service] Device 0 CPU core 16
+lshn-qs-e9wz-2:398354:399802 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 4
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO ncclCommSplit comm 0x1bd2b0b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 23 color 59908776 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.04)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO ncclCommSplit comm 0x1bc9ade0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 25 color 440515407 key 0- Init START
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO comm 0x1bc9ade0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398355:399816 [1] NCCL INFO [Proxy Service] Device 1 CPU core 40
+lshn-qs-e9wz-2:398355:399817 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 25
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO ncclCommSplit comm 0x1bc9ade0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 25 color 440515407 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO ncclCommSplit comm 0x1be5ba60 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 27 color 1227022723 key 0- Init START
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO comm 0x1be5ba60 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398356:399831 [2] NCCL INFO [Proxy Service] Device 2 CPU core 110
+lshn-qs-e9wz-2:398356:399832 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 104
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO ncclCommSplit comm 0x1be5ba60 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 27 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO ncclCommSplit comm 0x1d1a3910 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 29 color 1301067556 key 0- Init START
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO comm 0x1d1a3910 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398357:399848 [3] NCCL INFO [Proxy Service] Device 3 CPU core 6
+lshn-qs-e9wz-2:398357:399849 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 9
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO ncclCommSplit comm 0x1d1a3910 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 29 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.07, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO ncclCommSplit comm 0x1be32cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 30 color 59908776 key 0- Init START
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO comm 0x1be32cc0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398354:399857 [0] NCCL INFO [Proxy Service] Device 0 CPU core 117
+lshn-qs-e9wz-2:398354:399858 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 107
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO ncclCommSplit comm 0x1be32cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 30 color 59908776 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.10)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO ncclCommSplit comm 0x1bda29f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 32 color 440515407 key 0- Init START
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO comm 0x1bda29f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398355:399872 [1] NCCL INFO [Proxy Service] Device 1 CPU core 13
+lshn-qs-e9wz-2:398355:399873 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 15
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO ncclCommSplit comm 0x1bda29f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 32 color 440515407 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO ncclCommSplit comm 0x1bf63670 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 34 color 1227022723 key 0- Init START
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO comm 0x1bf63670 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398356:399887 [2] NCCL INFO [Proxy Service] Device 2 CPU core 5
+lshn-qs-e9wz-2:398356:399888 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 7
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO ncclCommSplit comm 0x1bf63670 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 34 color 1227022723 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Using network Socket
+INFO 11-30 19:30:54 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 11-30 19:30:54 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO ncclCommSplit comm 0x1d2ab520 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 36 color 1301067556 key 0- Init START
+INFO 11-30 19:30:54 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO comm 0x1d2ab520 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 00/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 01/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 02/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 03/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 04/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 05/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 06/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 07/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 08/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 09/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 10/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 11/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 12/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 13/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 14/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 15/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 16/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 17/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 18/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 19/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 20/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 21/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 22/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 23/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 24/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 25/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 26/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 27/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 28/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 29/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 30/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 31/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 32/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 33/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 34/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 35/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 36/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 37/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 38/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 39/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 40/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 41/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 42/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 43/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 44/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 45/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 46/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 47/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 48/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 49/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 50/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 51/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 52/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 53/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 54/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 55/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 56/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 57/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 58/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 59/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 60/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 61/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 62/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 63/64 : 0
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+lshn-qs-e9wz-2:398357:399899 [3] NCCL INFO [Proxy Service] Device 3 CPU core 45
+lshn-qs-e9wz-2:398357:399900 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 108
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO ncclCommSplit comm 0x1d2ab520 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 36 color 1301067556 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+INFO 11-30 19:30:54 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
+INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch...
+INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine.
+INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors']
+INFO 11-30 19:30:56 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
+[AINFO 11-30 19:30:57 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 11-30 19:30:58 [weight_utils.py:369] Time spent downloading weights for deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B: 0.868513 seconds
+INFO 11-30 19:30:58 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+INFO 11-30 19:30:59 [weight_utils.py:406] No model.safetensors.index.json found in remote.
+
+Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00,  3.00s/it]
+[ALoading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00,  3.00s/it]
+
+INFO 11-30 19:30:59 [default_loader.py:268] Loading weights took 2.39 seconds
+INFO 11-30 19:30:59 [default_loader.py:268] Loading weights took 3.29 seconds
+INFO 11-30 19:30:59 [default_loader.py:268] Loading weights took 1.52 seconds
+INFO 11-30 19:30:59 [default_loader.py:268] Loading weights took 0.79 seconds
+INFO 11-30 19:31:00 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 4.363175 seconds
+INFO 11-30 19:31:00 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 4.621696 seconds
+INFO 11-30 19:31:00 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 4.584446 seconds
+INFO 11-30 19:31:00 [gpu_model_runner.py:2392] Model loading took 3.3461 GiB and 4.661069 seconds
+INFO 11-30 19:31:06 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_3_0/backbone for vLLM's torch.compile
+INFO 11-30 19:31:06 [backends.py:550] Dynamo bytecode transform time: 5.82 s
+INFO 11-30 19:31:06 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_2_0/backbone for vLLM's torch.compile
+INFO 11-30 19:31:06 [backends.py:550] Dynamo bytecode transform time: 5.62 s
+INFO 11-30 19:31:06 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_0_0/backbone for vLLM's torch.compile
+INFO 11-30 19:31:06 [backends.py:550] Dynamo bytecode transform time: 5.57 s
+INFO 11-30 19:31:06 [backends.py:539] Using cache directory: /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/.cache/vllm/torch_compile_cache/63109e049c/rank_1_0/backbone for vLLM's torch.compile
+INFO 11-30 19:31:06 [backends.py:550] Dynamo bytecode transform time: 5.66 s
+INFO 11-30 19:31:09 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.914 s
+INFO 11-30 19:31:09 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.968 s
+INFO 11-30 19:31:09 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.941 s
+INFO 11-30 19:31:09 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.864 s
+INFO 11-30 19:31:09 [monitor.py:34] torch.compile takes 5.82 s in total
+INFO 11-30 19:31:09 [monitor.py:34] torch.compile takes 5.62 s in total
+INFO 11-30 19:31:09 [monitor.py:34] torch.compile takes 5.57 s in total
+INFO 11-30 19:31:09 [monitor.py:34] torch.compile takes 5.66 s in total
+INFO 11-30 19:31:10 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 11-30 19:31:10 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 11-30 19:31:10 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 11-30 19:31:10 [gpu_worker.py:298] Available KV cache memory: 50.14 GiB
+INFO 11-30 19:31:11 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 11-30 19:31:11 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 11-30 19:31:11 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 11-30 19:31:11 [kv_cache_utils.py:864] GPU KV cache size: 1,877,584 tokens
+INFO 11-30 19:31:11 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 11-30 19:31:11 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 11-30 19:31:11 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+INFO 11-30 19:31:11 [kv_cache_utils.py:868] Maximum concurrency for 16,896 tokens per request: 111.13x
+
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/11 [00:00<?, ?it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  27%|██▋       | 3/11 [00:00<00:00, 29.15it/s][A
+Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  64%|██████▎   | 7/11 [00:00<00:00, 34.08it/s][ACapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 11/11 [00:00<00:00, 35.52it/s]
+INFO 11-30 19:31:12 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 11-30 19:31:12 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 11-30 19:31:12 [core.py:218] init engine (profile, create kv cache, warmup model) took 11.77 seconds
+INFO 11-30 19:31:12 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 11-30 19:31:12 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 11-30 19:31:12 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 11-30 19:31:12 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 11-30 19:31:12 [core.py:218] init engine (profile, create kv cache, warmup model) took 11.84 seconds
+INFO 11-30 19:31:12 [core.py:218] init engine (profile, create kv cache, warmup model) took 11.94 seconds
+INFO 11-30 19:31:12 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.16 GiB
+INFO 11-30 19:31:12 [gpu_worker.py:391] Free memory on device (139.08/139.81 GiB) on startup. Desired GPU memory utilization is (0.4, 55.92 GiB). Actual usage is 3.35 GiB for weight, 0.28 GiB for peak activation, 2.16 GiB for non-torch memory, and 0.16 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=53509203353` to fit into requested memory, or `--kv-cache-memory=142796721152` to fully utilize gpu memory. Current kv cache memory in use is 53834261913 bytes.
+INFO 11-30 19:31:12 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.41 seconds
+INFO 11-30 19:31:13 [llm.py:295] Supported_tasks: ('generate',)
+INFO 11-30 19:31:13 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
+INFO 11-30 19:31:13 [llm.py:295] Supported_tasks: ('generate',)
+INFO 11-30 19:31:13 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
+INFO 11-30 19:31:13 [llm.py:295] Supported_tasks: ('generate',)
+INFO 11-30 19:31:13 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
+INFO 11-30 19:31:13 [llm.py:295] Supported_tasks: ('generate',)
+INFO 11-30 19:31:13 [__init__.py:36] No IOProcessor plugins requested by the model
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}.
+[OpenTinker] 2025-11-30 19:31:14,416 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value.
+lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Assigned NET plugin Socket to comm
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Using network Socket
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO ncclCommSplit comm 0x4ab09c80 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 37 color 2003953581 key 1- Init START
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO ncclCommSplit comm 0x4f5333d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 37 color 2003953581 key 0- Init START
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO ncclCommSplit comm 0x4c09f530 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 37 color 2003953581 key 3- Init START
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO ncclCommSplit comm 0x4acd8820 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 37 color 2003953581 key 2- Init START
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO comm 0x4acd8820 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO comm 0x4c09f530 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO comm 0x4f5333d0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO comm 0x4ab09c80 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 00/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 01/24 : 0 1 2 3
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 02/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 03/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 04/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 05/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 06/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 07/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 08/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 09/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 10/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 11/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 12/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 13/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 14/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 15/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 16/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 17/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 18/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 19/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 20/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 21/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 22/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 23/24 : 0 1 2 3
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO P2P Chunksize set to 524288
+lshn-qs-e9wz-2:398356:400035 [2] NCCL INFO [Proxy Service] Device 2 CPU core 110
+lshn-qs-e9wz-2:398356:400036 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 114
+lshn-qs-e9wz-2:398355:400037 [1] NCCL INFO [Proxy Service] Device 1 CPU core 25
+lshn-qs-e9wz-2:398355:400038 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 125
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0
+lshn-qs-e9wz-2:398354:400039 [0] NCCL INFO [Proxy Service] Device 0 CPU core 32
+lshn-qs-e9wz-2:398354:400040 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 24
+lshn-qs-e9wz-2:398357:400041 [3] NCCL INFO [Proxy Service] Device 3 CPU core 123
+lshn-qs-e9wz-2:398357:400042 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 140
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO CC Off, workFifoBytes 1048576
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO ncclCommSplit comm 0x4c09f530 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 37 color 2003953581 key 3 - Init COMPLETE
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO ncclCommSplit comm 0x4ab09c80 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 37 color 2003953581 key 1 - Init COMPLETE
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO ncclCommSplit comm 0x4f5333d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 37 color 2003953581 key 0 - Init COMPLETE
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO ncclCommSplit comm 0x4acd8820 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 37 color 2003953581 key 2 - Init COMPLETE
+lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.01)
+lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.03, rest 0.01)
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.02)
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
+INFO 11-30 19:31:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:31:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:31:15 [block_pool.py:292] Successfully reset prefix cache
+wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
+
+  0%|          | 0/1024 [00:00<?, ?it/s][AINFO 11-30 19:31:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:31:18 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 11-30 19:31:18 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 11-30 19:31:18 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+INFO 11-30 19:31:19 [chat_utils.py:538] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 1/1024 [02:24<41:02:40, 144.44s/it][A
+                                                    [A{'loss': 0.0, 'grad_norm': 0.0040327501483261585, 'learning_rate': 1e-05, 'num_tokens': 792270.0, 'completions/mean_length': 6039.171875, 'completions/min_length': 250.0, 'completions/max_length': 15689.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6039.171875, 'completions/min_terminated_length': 250.0, 'completions/max_terminated_length': 15689.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020948849618434906, 'sampling/sampling_logp_difference/max': 2.2866344451904297, 'sampling/importance_sampling_ratio/min': 0.10160785913467407, 'sampling/importance_sampling_ratio/mean': 1.0000098943710327, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 0.0, 'epoch': 0.0}
+
+  0%|          | 1/1024 [02:24<41:02:40, 144.44s/it][AINFO 11-30 19:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:33:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 2/1024 [04:41<39:41:51, 139.84s/it][A
+                                                    [A{'loss': -0.0, 'grad_norm': 0.00435988511890173, 'learning_rate': 1e-05, 'num_tokens': 1450225.0, 'completions/mean_length': 4958.0234375, 'completions/min_length': 556.0, 'completions/max_length': 15323.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4958.0234375, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 15323.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.31011277437210083, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018736306577920914, 'sampling/sampling_logp_difference/max': 1.8555700778961182, 'sampling/importance_sampling_ratio/min': 0.15636378526687622, 'sampling/importance_sampling_ratio/mean': 1.0000195503234863, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.089610599246953e-05, 'epoch': 0.0}
+
+  0%|          | 2/1024 [04:41<39:41:51, 139.84s/it][AINFO 11-30 19:35:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:35:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:35:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:35:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 3/1024 [07:18<41:53:51, 147.73s/it][A
+                                                    [A{'loss': 0.0, 'grad_norm': 0.0019109727581962943, 'learning_rate': 1e-05, 'num_tokens': 2319536.0, 'completions/mean_length': 6646.4296875, 'completions/min_length': 774.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6412.728515625, 'completions/min_terminated_length': 774.0, 'completions/max_terminated_length': 16268.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020154934376478195, 'sampling/sampling_logp_difference/max': 2.3241114616394043, 'sampling/importance_sampling_ratio/min': 0.0978703647851944, 'sampling/importance_sampling_ratio/mean': 0.9999492764472961, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.675667426250584e-05, 'epoch': 0.0}
+
+  0%|          | 3/1024 [07:18<41:53:51, 147.73s/it][AINFO 11-30 19:38:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:38:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:38:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:38:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 4/1024 [09:44<41:44:10, 147.30s/it][A
+                                                    [A{'loss': 0.0, 'grad_norm': 0.003392312675714493, 'learning_rate': 1e-05, 'num_tokens': 3076438.0, 'completions/mean_length': 5722.671875, 'completions/min_length': 264.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5553.44482421875, 'completions/min_terminated_length': 264.0, 'completions/max_terminated_length': 15628.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01987290009856224, 'sampling/sampling_logp_difference/max': 4.279532432556152, 'sampling/importance_sampling_ratio/min': 0.013849136419594288, 'sampling/importance_sampling_ratio/mean': 0.9999923706054688, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.18524738531778e-05, 'epoch': 0.0}
+
+  0%|          | 4/1024 [09:44<41:44:10, 147.30s/it][AINFO 11-30 19:41:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:41:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:41:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:41:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  0%|          | 5/1024 [12:04<40:55:17, 144.57s/it][A
+                                                    [A{'loss': -0.0, 'grad_norm': 0.0026077590882778168, 'learning_rate': 1e-05, 'num_tokens': 3747671.0, 'completions/mean_length': 5083.3203125, 'completions/min_length': 4.0, 'completions/max_length': 15637.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5083.3203125, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15637.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.20069602131843567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01915888860821724, 'sampling/sampling_logp_difference/max': 2.0136964321136475, 'sampling/importance_sampling_ratio/min': 0.1334943026304245, 'sampling/importance_sampling_ratio/mean': 0.9999581575393677, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.242316643081722e-05, 'epoch': 0.0}
+
+  0%|          | 5/1024 [12:04<40:55:17, 144.57s/it][AINFO 11-30 19:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:43:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|          | 6/1024 [14:03<38:23:11, 135.75s/it][A
+                                                    [A{'loss': 0.0, 'grad_norm': 0.0026839568745344877, 'learning_rate': 1e-05, 'num_tokens': 4365293.0, 'completions/mean_length': 4679.796875, 'completions/min_length': 230.0, 'completions/max_length': 14356.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4679.796875, 'completions/min_terminated_length': 230.0, 'completions/max_terminated_length': 14356.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.36796674132347107, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017540350556373596, 'sampling/sampling_logp_difference/max': 2.4150607585906982, 'sampling/importance_sampling_ratio/min': 0.1559910923242569, 'sampling/importance_sampling_ratio/mean': 1.0000786781311035, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.599262815143447e-05, 'epoch': 0.01}
+
+  1%|          | 6/1024 [14:03<38:23:11, 135.75s/it][AINFO 11-30 19:45:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:45:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:45:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:45:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|          | 7/1024 [16:35<39:50:18, 141.02s/it][A
+                                                    [A{'loss': 0.0001, 'grad_norm': 0.002131110057234764, 'learning_rate': 1e-05, 'num_tokens': 5161635.0, 'completions/mean_length': 6071.671875, 'completions/min_length': 502.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5907.984375, 'completions/min_terminated_length': 502.0, 'completions/max_terminated_length': 16035.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.21488475799560547, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020029261708259583, 'sampling/sampling_logp_difference/max': 2.7412960529327393, 'sampling/importance_sampling_ratio/min': 0.06448671966791153, 'sampling/importance_sampling_ratio/mean': 0.9999544620513916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.3293280037250952e-05, 'epoch': 0.01}
+
+  1%|          | 7/1024 [16:35<39:50:18, 141.02s/it][AINFO 11-30 19:47:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:47:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:47:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:47:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|          | 8/1024 [19:08<40:54:54, 144.98s/it][A
+                                                    [A{'loss': -0.0, 'grad_norm': 0.0021462184377014637, 'learning_rate': 1e-05, 'num_tokens': 5925239.0, 'completions/mean_length': 5815.03125, 'completions/min_length': 383.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5474.0966796875, 'completions/min_terminated_length': 383.0, 'completions/max_terminated_length': 14394.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01970522105693817, 'sampling/sampling_logp_difference/max': 3.086879014968872, 'sampling/importance_sampling_ratio/min': 0.045644186437129974, 'sampling/importance_sampling_ratio/mean': 1.0000536441802979, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.639496905336273e-05, 'epoch': 0.01}
+
+  1%|          | 8/1024 [19:08<40:54:54, 144.98s/it][AINFO 11-30 19:50:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:50:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:50:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:50:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|          | 9/1024 [20:58<37:46:48, 134.00s/it][A
+                                                    [A{'loss': -0.0, 'grad_norm': 0.0019890516996383667, 'learning_rate': 1e-05, 'num_tokens': 6434900.0, 'completions/mean_length': 3835.4765625, 'completions/min_length': 100.0, 'completions/max_length': 14055.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3835.4765625, 'completions/min_terminated_length': 100.0, 'completions/max_terminated_length': 14055.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2688046097755432, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015238583087921143, 'sampling/sampling_logp_difference/max': 2.5898101329803467, 'sampling/importance_sampling_ratio/min': 0.07503428310155869, 'sampling/importance_sampling_ratio/mean': 0.9999893307685852, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.706731510850659e-05, 'epoch': 0.01}
+
+  1%|          | 9/1024 [20:58<37:46:48, 134.00s/it][AINFO 11-30 19:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:52:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:52:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|          | 10/1024 [23:31<39:26:12, 140.01s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.0032330837566405535, 'learning_rate': 1e-05, 'num_tokens': 7112942.0, 'completions/mean_length': 5163.203125, 'completions/min_length': 538.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4893.904296875, 'completions/min_terminated_length': 538.0, 'completions/max_terminated_length': 14245.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.39926254749298096, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.015937291085720062, 'sampling/sampling_logp_difference/max': 3.137075424194336, 'sampling/importance_sampling_ratio/min': 0.0434095673263073, 'sampling/importance_sampling_ratio/mean': 1.0000337362289429, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.005580285593169e-05, 'epoch': 0.01}
+
+  1%|          | 10/1024 [23:31<39:26:12, 140.01s/it][AINFO 11-30 19:54:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:54:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:54:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:54:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|          | 11/1024 [25:42<38:38:05, 137.30s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.003946480806916952, 'learning_rate': 1e-05, 'num_tokens': 7730390.0, 'completions/mean_length': 4668.875, 'completions/min_length': 272.0, 'completions/max_length': 15373.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4668.875, 'completions/min_terminated_length': 272.0, 'completions/max_terminated_length': 15373.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.18595287203788757, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.018118487671017647, 'sampling/sampling_logp_difference/max': 3.047102689743042, 'sampling/importance_sampling_ratio/min': 0.047496337443590164, 'sampling/importance_sampling_ratio/mean': 1.0000364780426025, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.7124882106145378e-05, 'epoch': 0.01}
+
+  1%|          | 11/1024 [25:42<38:38:05, 137.30s/it][AINFO 11-30 19:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:56:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|          | 12/1024 [28:24<40:40:23, 144.69s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.0019970594439655542, 'learning_rate': 1e-05, 'num_tokens': 8562946.0, 'completions/mean_length': 6343.15625, 'completions/min_length': 212.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6019.2578125, 'completions/min_terminated_length': 212.0, 'completions/max_terminated_length': 16187.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3214184641838074, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019836850464344025, 'sampling/sampling_logp_difference/max': 3.1400341987609863, 'sampling/importance_sampling_ratio/min': 0.04328132048249245, 'sampling/importance_sampling_ratio/mean': 0.9999914169311523, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.592260761957732e-05, 'epoch': 0.01}
+
+  1%|          | 12/1024 [28:24<40:40:23, 144.69s/it][AINFO 11-30 19:59:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:59:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:59:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 19:59:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|▏         | 13/1024 [30:52<40:53:54, 145.63s/it][A
+                                                     [A{'loss': -0.0, 'grad_norm': 0.0014140807325020432, 'learning_rate': 1e-05, 'num_tokens': 9393912.0, 'completions/mean_length': 6320.296875, 'completions/min_length': 58.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6241.05517578125, 'completions/min_terminated_length': 58.0, 'completions/max_terminated_length': 14129.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.1643974632024765, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020306462422013283, 'sampling/sampling_logp_difference/max': 3.6733789443969727, 'sampling/importance_sampling_ratio/min': 0.025390533730387688, 'sampling/importance_sampling_ratio/mean': 1.0000437498092651, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.2711259305197018e-05, 'epoch': 0.01}
+
+  1%|▏         | 13/1024 [30:52<40:53:54, 145.63s/it][AINFO 11-30 20:02:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:02:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:02:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:02:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|▏         | 14/1024 [32:58<39:13:01, 139.78s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.0027572764083743095, 'learning_rate': 1e-05, 'num_tokens': 10053466.0, 'completions/mean_length': 4998.515625, 'completions/min_length': 326.0, 'completions/max_length': 14576.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4998.515625, 'completions/min_terminated_length': 326.0, 'completions/max_terminated_length': 14576.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2856566905975342, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019162334501743317, 'sampling/sampling_logp_difference/max': 1.5767145156860352, 'sampling/importance_sampling_ratio/min': 0.259263277053833, 'sampling/importance_sampling_ratio/mean': 0.9999816417694092, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.457231307242182e-05, 'epoch': 0.01}
+
+  1%|▏         | 14/1024 [32:58<39:13:01, 139.78s/it][AINFO 11-30 20:04:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:04:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:04:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:04:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  1%|▏         | 15/1024 [35:16<39:02:54, 139.32s/it][A
+                                                     [A{'loss': -0.0, 'grad_norm': 0.00292862462811172, 'learning_rate': 1e-05, 'num_tokens': 10682960.0, 'completions/mean_length': 4757.796875, 'completions/min_length': 18.0, 'completions/max_length': 15795.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4757.796875, 'completions/min_terminated_length': 18.0, 'completions/max_terminated_length': 15795.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.28117600083351135, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019605014473199844, 'sampling/sampling_logp_difference/max': 2.302217483520508, 'sampling/importance_sampling_ratio/min': 0.1194421648979187, 'sampling/importance_sampling_ratio/mean': 0.9999508261680603, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.840768130838114e-05, 'epoch': 0.01}
+
+  1%|▏         | 15/1024 [35:16<39:02:54, 139.32s/it][AINFO 11-30 20:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:06:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:06:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 16/1024 [37:38<39:10:38, 139.92s/it][A
+                                                     [A{'loss': -0.0, 'grad_norm': 0.0027386173605918884, 'learning_rate': 1e-05, 'num_tokens': 11414841.0, 'completions/mean_length': 5577.4453125, 'completions/min_length': 622.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5492.3544921875, 'completions/min_terminated_length': 622.0, 'completions/max_terminated_length': 15363.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018583202734589577, 'sampling/sampling_logp_difference/max': 4.0097270011901855, 'sampling/importance_sampling_ratio/min': 0.018138347193598747, 'sampling/importance_sampling_ratio/mean': 1.000007152557373, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.9595267531258287e-05, 'epoch': 0.01}
+
+  2%|▏         | 16/1024 [37:38<39:10:38, 139.92s/it][AINFO 11-30 20:08:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:08:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:08:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:08:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 17/1024 [40:07<39:56:37, 142.80s/it][A
+                                                     [A{'loss': -0.0001, 'grad_norm': 0.004777677357196808, 'learning_rate': 1e-05, 'num_tokens': 12078932.0, 'completions/mean_length': 5017.5234375, 'completions/min_length': 15.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4744.72802734375, 'completions/min_terminated_length': 15.0, 'completions/max_terminated_length': 13259.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.29644322395324707, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018336256965994835, 'sampling/sampling_logp_difference/max': 2.1398420333862305, 'sampling/importance_sampling_ratio/min': 0.14092504978179932, 'sampling/importance_sampling_ratio/mean': 1.0000548362731934, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1305933311596164e-05, 'epoch': 0.02}
+
+  2%|▏         | 17/1024 [40:07<39:56:37, 142.80s/it][AINFO 11-30 20:11:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:11:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:11:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:11:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 18/1024 [42:42<40:54:53, 146.42s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.003229200141504407, 'learning_rate': 1e-05, 'num_tokens': 12949934.0, 'completions/mean_length': 6649.203125, 'completions/min_length': 525.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6415.568359375, 'completions/min_terminated_length': 525.0, 'completions/max_terminated_length': 14874.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.29249149560928345, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017176847904920578, 'sampling/sampling_logp_difference/max': 1.9438819885253906, 'sampling/importance_sampling_ratio/min': 0.14314717054367065, 'sampling/importance_sampling_ratio/mean': 0.9999781250953674, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.454851403399516e-05, 'epoch': 0.02}
+
+  2%|▏         | 18/1024 [42:42<40:54:53, 146.42s/it][AINFO 11-30 20:13:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:13:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:13:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:13:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 19/1024 [45:13<41:14:24, 147.73s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.0022644924465566874, 'learning_rate': 1e-05, 'num_tokens': 13643556.0, 'completions/mean_length': 5261.109375, 'completions/min_length': 805.0, 'completions/max_length': 16279.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5261.109375, 'completions/min_terminated_length': 805.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2937847375869751, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017171338200569153, 'sampling/sampling_logp_difference/max': 3.0889134407043457, 'sampling/importance_sampling_ratio/min': 0.045551422983407974, 'sampling/importance_sampling_ratio/mean': 1.0000569820404053, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.204073885854086e-05, 'epoch': 0.02}
+
+  2%|▏         | 19/1024 [45:13<41:14:24, 147.73s/it][AINFO 11-30 20:16:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:16:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:16:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:16:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 20/1024 [47:02<37:58:00, 136.14s/it][A
+                                                     [A{'loss': 0.0001, 'grad_norm': 0.002566170645877719, 'learning_rate': 1e-05, 'num_tokens': 14225630.0, 'completions/mean_length': 4414.078125, 'completions/min_length': 196.0, 'completions/max_length': 12488.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4414.078125, 'completions/min_terminated_length': 196.0, 'completions/max_terminated_length': 12488.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01793120801448822, 'sampling/sampling_logp_difference/max': 6.218426704406738, 'sampling/importance_sampling_ratio/min': 0.0019923774525523186, 'sampling/importance_sampling_ratio/mean': 1.0000309944152832, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.731568560600863e-05, 'epoch': 0.02}
+
+  2%|▏         | 20/1024 [47:02<37:58:00, 136.14s/it][AINFO 11-30 20:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:18:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 21/1024 [49:02<36:34:28, 131.27s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.002611492294818163, 'learning_rate': 1e-05, 'num_tokens': 14893668.0, 'completions/mean_length': 5064.609375, 'completions/min_length': 476.0, 'completions/max_length': 14101.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5064.609375, 'completions/min_terminated_length': 476.0, 'completions/max_terminated_length': 14101.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.22331714630126953, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019949907436966896, 'sampling/sampling_logp_difference/max': 1.28975248336792, 'sampling/importance_sampling_ratio/min': 0.27533891797065735, 'sampling/importance_sampling_ratio/mean': 0.9998984336853027, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.87658201866725e-05, 'epoch': 0.02}
+
+  2%|▏         | 21/1024 [49:02<36:34:28, 131.27s/it][AINFO 11-30 20:20:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:20:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:20:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:20:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 22/1024 [50:50<34:38:36, 124.47s/it][A
+                                                     [A{'loss': -0.0001, 'grad_norm': 0.005417963024228811, 'learning_rate': 1e-05, 'num_tokens': 15515639.0, 'completions/mean_length': 4698.6484375, 'completions/min_length': 690.0, 'completions/max_length': 12494.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4698.6484375, 'completions/min_terminated_length': 690.0, 'completions/max_terminated_length': 12494.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.017951633781194687, 'sampling/sampling_logp_difference/max': 1.9050612449645996, 'sampling/importance_sampling_ratio/min': 0.14881353080272675, 'sampling/importance_sampling_ratio/mean': 0.9999631643295288, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.3738869003864238e-05, 'epoch': 0.02}
+
+  2%|▏         | 22/1024 [50:50<34:38:36, 124.47s/it][AINFO 11-30 20:22:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:22:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:22:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:22:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 23/1024 [53:11<35:57:39, 129.33s/it][A
+                                                     [A{'loss': -0.0, 'grad_norm': 0.003612510859966278, 'learning_rate': 1e-05, 'num_tokens': 16322836.0, 'completions/mean_length': 6136.8515625, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6056.16552734375, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15237.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020091045647859573, 'sampling/sampling_logp_difference/max': 3.0877466201782227, 'sampling/importance_sampling_ratio/min': 0.045604605227708817, 'sampling/importance_sampling_ratio/mean': 0.9999415874481201, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.080202666045807e-05, 'epoch': 0.02}
+
+  2%|▏         | 23/1024 [53:11<35:57:39, 129.33s/it][AINFO 11-30 20:24:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:24:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:24:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:24:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 24/1024 [55:52<38:34:13, 138.85s/it][A
+                                                     [A{'loss': 0.0, 'grad_norm': 0.0025575768668204546, 'learning_rate': 1e-05, 'num_tokens': 17075801.0, 'completions/mean_length': 5722.9140625, 'completions/min_length': 258.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5553.69091796875, 'completions/min_terminated_length': 258.0, 'completions/max_terminated_length': 15883.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.292504221200943, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01926530711352825, 'sampling/sampling_logp_difference/max': 5.114993095397949, 'sampling/importance_sampling_ratio/min': 0.006006019189953804, 'sampling/importance_sampling_ratio/mean': 0.9999945759773254, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.133698050485691e-05, 'epoch': 0.02}
+
+  2%|▏         | 24/1024 [55:52<38:34:13, 138.85s/it][AINFO 11-30 20:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:27:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  2%|▏         | 25/1024 [58:21<39:21:42, 141.84s/it][A
+                                                     [A{'loss': -0.0001, 'grad_norm': 0.003992805723100901, 'learning_rate': 1e-05, 'num_tokens': 17726724.0, 'completions/mean_length': 4901.1484375, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4810.732421875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15752.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.31929677724838257, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017930708825588226, 'sampling/sampling_logp_difference/max': 4.1546125411987305, 'sampling/importance_sampling_ratio/min': 0.01569186896085739, 'sampling/importance_sampling_ratio/mean': 1.0000249147415161, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.0483802371891215e-05, 'epoch': 0.02}
+
+  2%|▏         | 25/1024 [58:21<39:21:42, 141.84s/it][AINFO 11-30 20:29:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:29:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:29:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:29:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 26/1024 [1:00:13<36:47:55, 132.74s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.001488999230787158, 'learning_rate': 1e-05, 'num_tokens': 18448080.0, 'completions/mean_length': 5491.84375, 'completions/min_length': 467.0, 'completions/max_length': 13779.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5491.84375, 'completions/min_terminated_length': 467.0, 'completions/max_terminated_length': 13779.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.28011515736579895, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01949649676680565, 'sampling/sampling_logp_difference/max': 1.683163046836853, 'sampling/importance_sampling_ratio/min': 0.18578539788722992, 'sampling/importance_sampling_ratio/mean': 0.9999741911888123, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.570172680156247e-05, 'epoch': 0.02}
+
+  3%|▎         | 26/1024 [1:00:13<36:47:55, 132.74s/it][AINFO 11-30 20:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:31:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 27/1024 [1:02:25<36:46:21, 132.78s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.013453707098960876, 'learning_rate': 1e-05, 'num_tokens': 19222315.0, 'completions/mean_length': 5900.2109375, 'completions/min_length': 58.0, 'completions/max_length': 15280.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5900.2109375, 'completions/min_terminated_length': 58.0, 'completions/max_terminated_length': 15280.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.3713914752006531, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017992667853832245, 'sampling/sampling_logp_difference/max': 1.6051416397094727, 'sampling/importance_sampling_ratio/min': 0.2615732252597809, 'sampling/importance_sampling_ratio/mean': 1.0000121593475342, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.968139066681033e-05, 'epoch': 0.02}
+
+  3%|▎         | 27/1024 [1:02:25<36:46:21, 132.78s/it][AINFO 11-30 20:33:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:33:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:33:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:33:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 28/1024 [1:04:49<37:37:43, 136.01s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002751777181401849, 'learning_rate': 1e-05, 'num_tokens': 19999152.0, 'completions/mean_length': 5905.1015625, 'completions/min_length': 629.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5822.59033203125, 'completions/min_terminated_length': 629.0, 'completions/max_terminated_length': 15091.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.35588693618774414, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019589129835367203, 'sampling/sampling_logp_difference/max': 3.553239345550537, 'sampling/importance_sampling_ratio/min': 0.02863173931837082, 'sampling/importance_sampling_ratio/mean': 0.999993085861206, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.28169755145791e-05, 'epoch': 0.03}
+
+  3%|▎         | 28/1024 [1:04:49<37:37:43, 136.01s/it][AINFO 11-30 20:36:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:36:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:36:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:36:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 29/1024 [1:07:17<38:33:25, 139.50s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0031230382155627012, 'learning_rate': 1e-05, 'num_tokens': 20700185.0, 'completions/mean_length': 5315.2578125, 'completions/min_length': 185.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5228.1025390625, 'completions/min_terminated_length': 185.0, 'completions/max_terminated_length': 16341.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018846288323402405, 'sampling/sampling_logp_difference/max': 2.6308226585388184, 'sampling/importance_sampling_ratio/min': 0.07201918959617615, 'sampling/importance_sampling_ratio/mean': 0.9999688863754272, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.466136366929277e-05, 'epoch': 0.03}
+
+  3%|▎         | 29/1024 [1:07:17<38:33:25, 139.50s/it][AINFO 11-30 20:38:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:38:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:38:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:38:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 30/1024 [1:09:16<36:49:24, 133.36s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.003184435423463583, 'learning_rate': 1e-05, 'num_tokens': 21327227.0, 'completions/mean_length': 4754.578125, 'completions/min_length': 275.0, 'completions/max_length': 13530.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4754.578125, 'completions/min_terminated_length': 275.0, 'completions/max_terminated_length': 13530.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.38611698150634766, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01872986927628517, 'sampling/sampling_logp_difference/max': 1.666948914527893, 'sampling/importance_sampling_ratio/min': 0.18882229924201965, 'sampling/importance_sampling_ratio/mean': 1.00004243850708, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.0694953390338924e-05, 'epoch': 0.03}
+
+  3%|▎         | 30/1024 [1:09:16<36:49:24, 133.36s/it][AINFO 11-30 20:40:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:40:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:40:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:40:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 31/1024 [1:11:44<38:02:42, 137.93s/it][A
+                                                       [A{'loss': 0.0001, 'grad_norm': 0.0015998827293515205, 'learning_rate': 1e-05, 'num_tokens': 22137609.0, 'completions/mean_length': 6169.671875, 'completions/min_length': 77.0, 'completions/max_length': 16295.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6169.671875, 'completions/min_terminated_length': 77.0, 'completions/max_terminated_length': 16295.0, 'rewards/accuracy_reward/mean': 0.1484375, 'rewards/accuracy_reward/std': 0.356930136680603, 'reward': 0.1484375, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021658005192875862, 'sampling/sampling_logp_difference/max': 5.996496200561523, 'sampling/importance_sampling_ratio/min': 0.002487452467903495, 'sampling/importance_sampling_ratio/mean': 0.9999616146087646, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.6688619300330174e-05, 'epoch': 0.03}
+
+  3%|▎         | 31/1024 [1:11:44<38:02:42, 137.93s/it][AINFO 11-30 20:43:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:43:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:43:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:43:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 32/1024 [1:14:15<39:06:20, 141.92s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0009680070797912776, 'learning_rate': 1e-05, 'num_tokens': 22898573.0, 'completions/mean_length': 5782.46875, 'completions/min_length': 140.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5698.9921875, 'completions/min_terminated_length': 140.0, 'completions/max_terminated_length': 15377.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01947668567299843, 'sampling/sampling_logp_difference/max': 2.6632909774780273, 'sampling/importance_sampling_ratio/min': 0.06971839815378189, 'sampling/importance_sampling_ratio/mean': 1.0000598430633545, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.213877005237009e-05, 'epoch': 0.03}
+
+  3%|▎         | 32/1024 [1:14:15<39:06:20, 141.92s/it][AINFO 11-30 20:45:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:45:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:45:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:45:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 33/1024 [1:16:27<38:13:13, 138.84s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.003982205875217915, 'learning_rate': 1e-05, 'num_tokens': 23598284.0, 'completions/mean_length': 5295.5546875, 'completions/min_length': 517.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5208.244140625, 'completions/min_terminated_length': 517.0, 'completions/max_terminated_length': 13900.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3464113473892212, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019372105598449707, 'sampling/sampling_logp_difference/max': 2.892524003982544, 'sampling/importance_sampling_ratio/min': 0.055436115711927414, 'sampling/importance_sampling_ratio/mean': 1.000054121017456, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.333903382303106e-05, 'epoch': 0.03}
+
+  3%|▎         | 33/1024 [1:16:27<38:13:13, 138.84s/it][AINFO 11-30 20:47:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:47:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:47:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:47:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 34/1024 [1:18:12<35:22:34, 128.64s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.003291479777544737, 'learning_rate': 1e-05, 'num_tokens': 24238790.0, 'completions/mean_length': 4856.828125, 'completions/min_length': 388.0, 'completions/max_length': 11953.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4856.828125, 'completions/min_terminated_length': 388.0, 'completions/max_terminated_length': 11953.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.36008089780807495, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017960648983716965, 'sampling/sampling_logp_difference/max': 3.5692081451416016, 'sampling/importance_sampling_ratio/min': 0.028178159147500992, 'sampling/importance_sampling_ratio/mean': 1.0000040531158447, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.87502479700197e-05, 'epoch': 0.03}
+
+  3%|▎         | 34/1024 [1:18:12<35:22:34, 128.64s/it][AINFO 11-30 20:49:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:49:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:49:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:49:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  3%|▎         | 35/1024 [1:20:45<37:22:01, 136.02s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0017659832956269383, 'learning_rate': 1e-05, 'num_tokens': 25085522.0, 'completions/mean_length': 6465.90625, 'completions/min_length': 11.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6227.8720703125, 'completions/min_terminated_length': 11.0, 'completions/max_terminated_length': 15338.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.23144522309303284, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019682209938764572, 'sampling/sampling_logp_difference/max': 3.765620231628418, 'sampling/importance_sampling_ratio/min': 0.02315324731171131, 'sampling/importance_sampling_ratio/mean': 0.9999986886978149, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.820113442998263e-05, 'epoch': 0.03}
+
+  3%|▎         | 35/1024 [1:20:45<37:22:01, 136.02s/it][AINFO 11-30 20:52:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:52:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:52:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:52:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▎         | 36/1024 [1:23:18<38:44:41, 141.18s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0032874022144824266, 'learning_rate': 1e-05, 'num_tokens': 25864049.0, 'completions/mean_length': 5941.3671875, 'completions/min_length': 535.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5775.611328125, 'completions/min_terminated_length': 535.0, 'completions/max_terminated_length': 16210.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.25513991713523865, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01944374293088913, 'sampling/sampling_logp_difference/max': 1.8658325672149658, 'sampling/importance_sampling_ratio/min': 0.15476730465888977, 'sampling/importance_sampling_ratio/mean': 0.9999710321426392, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.513532473993109e-05, 'epoch': 0.03}
+
+  4%|▎         | 36/1024 [1:23:18<38:44:41, 141.18s/it][AINFO 11-30 20:54:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:54:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:54:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:54:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▎         | 37/1024 [1:25:50<39:32:30, 144.22s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0026312628760933876, 'learning_rate': 1e-05, 'num_tokens': 26709974.0, 'completions/mean_length': 6416.1015625, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6337.6142578125, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 14316.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018076686188578606, 'sampling/sampling_logp_difference/max': 2.856377601623535, 'sampling/importance_sampling_ratio/min': 0.05747658759355545, 'sampling/importance_sampling_ratio/mean': 0.9999989867210388, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.743831595988013e-05, 'epoch': 0.03}
+
+  4%|▎         | 37/1024 [1:25:50<39:32:30, 144.22s/it][AINFO 11-30 20:57:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:57:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:57:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:57:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▎         | 38/1024 [1:28:32<40:58:05, 149.58s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0029591964557766914, 'learning_rate': 1e-05, 'num_tokens': 27582090.0, 'completions/mean_length': 6664.59375, 'completions/min_length': 672.0, 'completions/max_length': 16155.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6664.59375, 'completions/min_terminated_length': 672.0, 'completions/max_terminated_length': 16155.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3385029733181, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019228119403123856, 'sampling/sampling_logp_difference/max': 3.7663159370422363, 'sampling/importance_sampling_ratio/min': 0.02313714474439621, 'sampling/importance_sampling_ratio/mean': 0.9999204874038696, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.651576796277368e-05, 'epoch': 0.03}
+
+  4%|▎         | 38/1024 [1:28:32<40:58:05, 149.58s/it][AINFO 11-30 20:59:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:59:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:59:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 20:59:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 39/1024 [1:30:58<40:36:38, 148.42s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.003023772733286023, 'learning_rate': 1e-05, 'num_tokens': 28334478.0, 'completions/mean_length': 5729.15625, 'completions/min_length': 198.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5645.259765625, 'completions/min_terminated_length': 198.0, 'completions/max_terminated_length': 15071.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.30061954259872437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019314374774694443, 'sampling/sampling_logp_difference/max': 4.673653602600098, 'sampling/importance_sampling_ratio/min': 0.009338089264929295, 'sampling/importance_sampling_ratio/mean': 0.9999845623970032, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.236799054524454e-05, 'epoch': 0.04}
+
+  4%|▍         | 39/1024 [1:30:58<40:36:38, 148.42s/it][AINFO 11-30 21:02:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:02:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:02:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:02:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 40/1024 [1:33:15<39:41:52, 145.24s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.003077085129916668, 'learning_rate': 1e-05, 'num_tokens': 28997043.0, 'completions/mean_length': 5021.2265625, 'completions/min_length': 85.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4931.755859375, 'completions/min_terminated_length': 85.0, 'completions/max_terminated_length': 16382.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.29644322395324707, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019833026453852654, 'sampling/sampling_logp_difference/max': 1.3179941177368164, 'sampling/importance_sampling_ratio/min': 0.267671674489975, 'sampling/importance_sampling_ratio/mean': 1.0000052452087402, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.894369768251636e-05, 'epoch': 0.04}
+
+  4%|▍         | 40/1024 [1:33:15<39:41:52, 145.24s/it][AINFO 11-30 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:04:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 41/1024 [1:35:26<38:28:59, 140.93s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0013224197318777442, 'learning_rate': 1e-05, 'num_tokens': 29688541.0, 'completions/mean_length': 5254.640625, 'completions/min_length': 844.0, 'completions/max_length': 13940.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5254.640625, 'completions/min_terminated_length': 844.0, 'completions/max_terminated_length': 13940.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01846657320857048, 'sampling/sampling_logp_difference/max': 2.6213583946228027, 'sampling/importance_sampling_ratio/min': 0.07270403206348419, 'sampling/importance_sampling_ratio/mean': 0.999992847442627, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.2690972261661955e-05, 'epoch': 0.04}
+
+  4%|▍         | 41/1024 [1:35:26<38:28:59, 140.93s/it][AINFO 11-30 21:06:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:06:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:06:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:06:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 42/1024 [1:37:39<37:44:44, 138.38s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0017147879116237164, 'learning_rate': 1e-05, 'num_tokens': 30432331.0, 'completions/mean_length': 5629.546875, 'completions/min_length': 674.0, 'completions/max_length': 14534.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5629.546875, 'completions/min_terminated_length': 674.0, 'completions/max_terminated_length': 14534.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.3145836591720581, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020487062633037567, 'sampling/sampling_logp_difference/max': 4.648956298828125, 'sampling/importance_sampling_ratio/min': 0.009571586735546589, 'sampling/importance_sampling_ratio/mean': 0.9999871253967285, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1964653064496815e-05, 'epoch': 0.04}
+
+  4%|▍         | 42/1024 [1:37:39<37:44:44, 138.38s/it][AINFO 11-30 21:08:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:08:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:08:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:08:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 43/1024 [1:39:44<36:38:16, 134.45s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0033384806010872126, 'learning_rate': 1e-05, 'num_tokens': 31130440.0, 'completions/mean_length': 5272.4140625, 'completions/min_length': 30.0, 'completions/max_length': 14808.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5272.4140625, 'completions/min_terminated_length': 30.0, 'completions/max_terminated_length': 14808.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.361660897731781, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018863137811422348, 'sampling/sampling_logp_difference/max': 4.001441955566406, 'sampling/importance_sampling_ratio/min': 0.018289247527718544, 'sampling/importance_sampling_ratio/mean': 0.9999641180038452, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.64322323775923e-05, 'epoch': 0.04}
+
+  4%|▍         | 43/1024 [1:39:44<36:38:16, 134.45s/it][AINFO 11-30 21:11:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:11:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:11:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:11:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 44/1024 [1:42:02<36:54:04, 135.56s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0018463641172274947, 'learning_rate': 1e-05, 'num_tokens': 31738951.0, 'completions/mean_length': 4601.8046875, 'completions/min_length': 110.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4414.7861328125, 'completions/min_terminated_length': 110.0, 'completions/max_terminated_length': 16056.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01843690313398838, 'sampling/sampling_logp_difference/max': 9.641263961791992, 'sampling/importance_sampling_ratio/min': 6.499085429823026e-05, 'sampling/importance_sampling_ratio/mean': 0.9999581575393677, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.386531327327248e-05, 'epoch': 0.04}
+
+  4%|▍         | 44/1024 [1:42:02<36:54:04, 135.56s/it][AINFO 11-30 21:13:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:13:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:13:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:13:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 45/1024 [1:44:44<39:00:38, 143.45s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0011749925324693322, 'learning_rate': 1e-05, 'num_tokens': 32668985.0, 'completions/mean_length': 7086.890625, 'completions/min_length': 657.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6786.98388671875, 'completions/min_terminated_length': 657.0, 'completions/max_terminated_length': 15360.0, 'rewards/accuracy_reward/mean': 0.1484375, 'rewards/accuracy_reward/std': 0.356930136680603, 'reward': 0.1484375, 'reward_std': 0.17358636856079102, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020170794799923897, 'sampling/sampling_logp_difference/max': 1.724916934967041, 'sampling/importance_sampling_ratio/min': 0.17818784713745117, 'sampling/importance_sampling_ratio/mean': 0.9999415278434753, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.3576611397402303e-05, 'epoch': 0.04}
+
+  4%|▍         | 45/1024 [1:44:44<39:00:38, 143.45s/it][AINFO 11-30 21:16:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:16:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:16:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:16:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  4%|▍         | 46/1024 [1:47:22<40:10:52, 147.91s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0021256105974316597, 'learning_rate': 1e-05, 'num_tokens': 33522473.0, 'completions/mean_length': 6526.0, 'completions/min_length': 605.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6125.26806640625, 'completions/min_terminated_length': 605.0, 'completions/max_terminated_length': 14053.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.25224900245666504, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018983429297804832, 'sampling/sampling_logp_difference/max': 1.882314682006836, 'sampling/importance_sampling_ratio/min': 0.15223731100559235, 'sampling/importance_sampling_ratio/mean': 0.9999562501907349, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.165765881225525e-05, 'epoch': 0.04}
+
+  4%|▍         | 46/1024 [1:47:22<40:10:52, 147.91s/it][AINFO 11-30 21:18:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:18:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:18:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:18:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▍         | 47/1024 [1:49:35<38:55:58, 143.46s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002230287529528141, 'learning_rate': 1e-05, 'num_tokens': 34221781.0, 'completions/mean_length': 5234.59375, 'completions/min_length': 53.0, 'completions/max_length': 13895.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5234.59375, 'completions/min_terminated_length': 53.0, 'completions/max_terminated_length': 13895.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020987385883927345, 'sampling/sampling_logp_difference/max': 3.7045910358428955, 'sampling/importance_sampling_ratio/min': 0.024610280990600586, 'sampling/importance_sampling_ratio/mean': 0.9998403787612915, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.4951827072072774e-05, 'epoch': 0.04}
+
+  5%|▍         | 47/1024 [1:49:35<38:55:58, 143.46s/it][AINFO 11-30 21:20:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:20:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:20:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:20:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▍         | 48/1024 [1:52:16<40:17:16, 148.60s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0033349934965372086, 'learning_rate': 1e-05, 'num_tokens': 35019086.0, 'completions/mean_length': 6091.6953125, 'completions/min_length': 53.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5844.68017578125, 'completions/min_terminated_length': 53.0, 'completions/max_terminated_length': 15149.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.38505616784095764, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017815347760915756, 'sampling/sampling_logp_difference/max': 5.36215877532959, 'sampling/importance_sampling_ratio/min': 0.004690769128501415, 'sampling/importance_sampling_ratio/mean': 1.0000643730163574, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.8454100326343905e-05, 'epoch': 0.04}
+
+  5%|▍         | 48/1024 [1:52:16<40:17:16, 148.60s/it][AINFO 11-30 21:23:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:23:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:23:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:23:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▍         | 49/1024 [1:54:56<41:11:02, 152.06s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0025556134060025215, 'learning_rate': 1e-05, 'num_tokens': 35798155.0, 'completions/mean_length': 5925.9140625, 'completions/min_length': 336.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5674.92041015625, 'completions/min_terminated_length': 336.0, 'completions/max_terminated_length': 16281.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3253750801086426, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01731565222144127, 'sampling/sampling_logp_difference/max': 2.642024517059326, 'sampling/importance_sampling_ratio/min': 0.07121694087982178, 'sampling/importance_sampling_ratio/mean': 1.0000495910644531, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.819393790787217e-05, 'epoch': 0.05}
+
+  5%|▍         | 49/1024 [1:54:56<41:11:02, 152.06s/it][AINFO 11-30 21:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:26:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▍         | 50/1024 [1:57:35<41:41:48, 154.12s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0023415833711624146, 'learning_rate': 1e-05, 'num_tokens': 36580774.0, 'completions/mean_length': 5978.3984375, 'completions/min_length': 204.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5728.6640625, 'completions/min_terminated_length': 204.0, 'completions/max_terminated_length': 14714.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.266974538564682, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02070978656411171, 'sampling/sampling_logp_difference/max': 8.360030174255371, 'sampling/importance_sampling_ratio/min': 0.00023403727391269058, 'sampling/importance_sampling_ratio/mean': 0.9996204376220703, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5808319555362687e-05, 'epoch': 0.05}
+
+  5%|▍         | 50/1024 [1:57:35<41:41:48, 154.12s/it][AINFO 11-30 21:28:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:28:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:28:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:28:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▍         | 51/1024 [2:00:10<41:44:36, 154.45s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.001733646378852427, 'learning_rate': 1e-05, 'num_tokens': 37478538.0, 'completions/mean_length': 6825.34375, 'completions/min_length': 152.0, 'completions/max_length': 15566.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6825.34375, 'completions/min_terminated_length': 152.0, 'completions/max_terminated_length': 15566.0, 'rewards/accuracy_reward/mean': 0.2109375, 'rewards/accuracy_reward/std': 0.4095771610736847, 'reward': 0.2109375, 'reward_std': 0.26485776901245117, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02004232443869114, 'sampling/sampling_logp_difference/max': 3.061105728149414, 'sampling/importance_sampling_ratio/min': 0.04683587700128555, 'sampling/importance_sampling_ratio/mean': 1.000016689300537, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.8137233332236065e-05, 'epoch': 0.05}
+
+  5%|▍         | 51/1024 [2:00:10<41:44:36, 154.45s/it][AINFO 11-30 21:31:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:31:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:31:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:31:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▌         | 52/1024 [2:02:24<40:04:01, 148.40s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0027474171947687864, 'learning_rate': 1e-05, 'num_tokens': 38208862.0, 'completions/mean_length': 5526.96875, 'completions/min_length': 260.0, 'completions/max_length': 13967.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5526.96875, 'completions/min_terminated_length': 260.0, 'completions/max_terminated_length': 13967.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020138733088970184, 'sampling/sampling_logp_difference/max': 1.7703008651733398, 'sampling/importance_sampling_ratio/min': 0.17028175294399261, 'sampling/importance_sampling_ratio/mean': 1.0000911951065063, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.2309794278262416e-05, 'epoch': 0.05}
+
+  5%|▌         | 52/1024 [2:02:24<40:04:01, 148.40s/it][AINFO 11-30 21:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:33:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▌         | 53/1024 [2:04:42<39:09:58, 145.21s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0023102632258087397, 'learning_rate': 1e-05, 'num_tokens': 38986441.0, 'completions/mean_length': 5925.2109375, 'completions/min_length': 507.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5759.19873046875, 'completions/min_terminated_length': 507.0, 'completions/max_terminated_length': 14637.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.22567614912986755, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020217979326844215, 'sampling/sampling_logp_difference/max': 1.3329081535339355, 'sampling/importance_sampling_ratio/min': 0.33553048968315125, 'sampling/importance_sampling_ratio/mean': 0.9999989867210388, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.059891464043176e-05, 'epoch': 0.05}
+
+  5%|▌         | 53/1024 [2:04:42<39:09:58, 145.21s/it][AINFO 11-30 21:35:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:35:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:35:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:35:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▌         | 54/1024 [2:07:14<39:36:56, 147.03s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.00286587281152606, 'learning_rate': 1e-05, 'num_tokens': 39737230.0, 'completions/mean_length': 5722.9140625, 'completions/min_length': 83.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5467.04833984375, 'completions/min_terminated_length': 83.0, 'completions/max_terminated_length': 14353.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.31930169463157654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01887870952486992, 'sampling/sampling_logp_difference/max': 1.8005337715148926, 'sampling/importance_sampling_ratio/min': 0.16521067917346954, 'sampling/importance_sampling_ratio/mean': 1.0000033378601074, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.101698436898005e-05, 'epoch': 0.05}
+
+  5%|▌         | 54/1024 [2:07:14<39:36:56, 147.03s/it][AINFO 11-30 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:38:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▌         | 55/1024 [2:09:30<38:42:44, 143.82s/it][A
+                                                       [A{'loss': -0.0002, 'grad_norm': 0.0030615110881626606, 'learning_rate': 1e-05, 'num_tokens': 40369133.0, 'completions/mean_length': 4784.8046875, 'completions/min_length': 217.0, 'completions/max_length': 15489.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4784.8046875, 'completions/min_terminated_length': 217.0, 'completions/max_terminated_length': 15489.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3185402750968933, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018476612865924835, 'sampling/sampling_logp_difference/max': 2.8061938285827637, 'sampling/importance_sampling_ratio/min': 0.060434579849243164, 'sampling/importance_sampling_ratio/mean': 0.9999684691429138, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.4379004723159596e-05, 'epoch': 0.05}
+
+  5%|▌         | 55/1024 [2:09:30<38:42:44, 143.82s/it][AINFO 11-30 21:40:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:40:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:40:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:40:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  5%|▌         | 56/1024 [2:12:19<40:44:13, 151.50s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0035595207009464502, 'learning_rate': 1e-05, 'num_tokens': 41228119.0, 'completions/mean_length': 6526.140625, 'completions/min_length': 260.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6210.33056640625, 'completions/min_terminated_length': 260.0, 'completions/max_terminated_length': 15789.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.28353503346443176, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01736801490187645, 'sampling/sampling_logp_difference/max': 2.39286470413208, 'sampling/importance_sampling_ratio/min': 0.1460556536912918, 'sampling/importance_sampling_ratio/mean': 0.9999958872795105, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.474045820439642e-05, 'epoch': 0.05}
+
+  5%|▌         | 56/1024 [2:12:19<40:44:13, 151.50s/it][AINFO 11-30 21:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:43:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▌         | 57/1024 [2:14:46<40:16:44, 149.95s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.003246234729886055, 'learning_rate': 1e-05, 'num_tokens': 42094926.0, 'completions/mean_length': 6621.8046875, 'completions/min_length': 264.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6544.93701171875, 'completions/min_terminated_length': 264.0, 'completions/max_terminated_length': 15756.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.3066929280757904, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02005051076412201, 'sampling/sampling_logp_difference/max': 2.960735559463501, 'sampling/importance_sampling_ratio/min': 0.051780816167593, 'sampling/importance_sampling_ratio/mean': 0.999980092048645, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.461227303589112e-05, 'epoch': 0.05}
+
+  6%|▌         | 57/1024 [2:14:46<40:16:44, 149.95s/it][AINFO 11-30 21:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:46:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▌         | 58/1024 [2:17:03<39:12:22, 146.11s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0021836543455719948, 'learning_rate': 1e-05, 'num_tokens': 42720566.0, 'completions/mean_length': 4745.5625, 'completions/min_length': 472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4466.240234375, 'completions/min_terminated_length': 472.0, 'completions/max_terminated_length': 14256.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018924405798316002, 'sampling/sampling_logp_difference/max': 8.609021186828613, 'sampling/importance_sampling_ratio/min': 0.00018245240789838135, 'sampling/importance_sampling_ratio/mean': 1.0000442266464233, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.228890182072064e-05, 'epoch': 0.05}
+
+  6%|▌         | 58/1024 [2:17:03<39:12:22, 146.11s/it][AINFO 11-30 21:48:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:48:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:48:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:48:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▌         | 59/1024 [2:18:41<35:17:59, 131.69s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.003501354483887553, 'learning_rate': 1e-05, 'num_tokens': 43317942.0, 'completions/mean_length': 4529.125, 'completions/min_length': 32.0, 'completions/max_length': 11583.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4529.125, 'completions/min_terminated_length': 32.0, 'completions/max_terminated_length': 11583.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.341156542301178, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018559476360678673, 'sampling/sampling_logp_difference/max': 1.2528820037841797, 'sampling/importance_sampling_ratio/min': 0.28568026423454285, 'sampling/importance_sampling_ratio/mean': 1.0000299215316772, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.9958335719347815e-05, 'epoch': 0.05}
+
+  6%|▌         | 59/1024 [2:18:41<35:17:59, 131.69s/it][AINFO 11-30 21:49:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:49:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:49:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:49:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▌         | 60/1024 [2:20:53<35:15:57, 131.70s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0021927307825535536, 'learning_rate': 1e-05, 'num_tokens': 44004932.0, 'completions/mean_length': 5225.984375, 'completions/min_length': 540.0, 'completions/max_length': 15216.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5225.984375, 'completions/min_terminated_length': 540.0, 'completions/max_terminated_length': 15216.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.29826053977012634, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01917683705687523, 'sampling/sampling_logp_difference/max': 4.181803226470947, 'sampling/importance_sampling_ratio/min': 0.01527094654738903, 'sampling/importance_sampling_ratio/mean': 0.999963641166687, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.8730497180949897e-05, 'epoch': 0.06}
+
+  6%|▌         | 60/1024 [2:20:53<35:15:57, 131.70s/it][AINFO 11-30 21:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:52:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▌         | 61/1024 [2:22:52<34:16:36, 128.14s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0029622926376760006, 'learning_rate': 1e-05, 'num_tokens': 44644930.0, 'completions/mean_length': 4849.609375, 'completions/min_length': 694.0, 'completions/max_length': 13501.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4849.609375, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 13501.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01774633303284645, 'sampling/sampling_logp_difference/max': 2.597506523132324, 'sampling/importance_sampling_ratio/min': 0.074459008872509, 'sampling/importance_sampling_ratio/mean': 1.0000462532043457, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.137683142995229e-05, 'epoch': 0.06}
+
+  6%|▌         | 61/1024 [2:22:52<34:16:36, 128.14s/it][AINFO 11-30 21:54:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:54:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:54:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:54:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▌         | 62/1024 [2:25:32<36:47:01, 137.65s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.006025045644491911, 'learning_rate': 1e-05, 'num_tokens': 45447489.0, 'completions/mean_length': 6126.4921875, 'completions/min_length': 499.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6045.724609375, 'completions/min_terminated_length': 499.0, 'completions/max_terminated_length': 16242.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.26485776901245117, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018755871802568436, 'sampling/sampling_logp_difference/max': 4.132395267486572, 'sampling/importance_sampling_ratio/min': 0.01604440063238144, 'sampling/importance_sampling_ratio/mean': 0.9999797344207764, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.551837869508745e-05, 'epoch': 0.06}
+
+  6%|▌         | 62/1024 [2:25:32<36:47:01, 137.65s/it][AINFO 11-30 21:56:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:56:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:56:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:56:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▌         | 63/1024 [2:28:35<40:22:10, 151.23s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.002317876787856221, 'learning_rate': 1e-05, 'num_tokens': 46300777.0, 'completions/mean_length': 6506.3125, 'completions/min_length': 163.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6428.53564453125, 'completions/min_terminated_length': 163.0, 'completions/max_terminated_length': 16125.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.30745434761047363, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01887967810034752, 'sampling/sampling_logp_difference/max': 4.3882856369018555, 'sampling/importance_sampling_ratio/min': 0.012422007508575916, 'sampling/importance_sampling_ratio/mean': 1.0000250339508057, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.30371132754226e-05, 'epoch': 0.06}
+
+  6%|▌         | 63/1024 [2:28:35<40:22:10, 151.23s/it][AINFO 11-30 21:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:59:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 21:59:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▋         | 64/1024 [2:30:33<37:37:32, 141.10s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0022679027169942856, 'learning_rate': 1e-05, 'num_tokens': 47079942.0, 'completions/mean_length': 5938.6015625, 'completions/min_length': 330.0, 'completions/max_length': 12890.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5938.6015625, 'completions/min_terminated_length': 330.0, 'completions/max_terminated_length': 12890.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.34033793210983276, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01814691722393036, 'sampling/sampling_logp_difference/max': 1.713322401046753, 'sampling/importance_sampling_ratio/min': 0.1802658885717392, 'sampling/importance_sampling_ratio/mean': 0.9999797344207764, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.02081663600984e-05, 'epoch': 0.06}
+
+  6%|▋         | 64/1024 [2:30:33<37:37:32, 141.10s/it][AINFO 11-30 22:01:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:01:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:01:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:01:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▋         | 65/1024 [2:32:49<37:12:44, 139.69s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.002760152332484722, 'learning_rate': 1e-05, 'num_tokens': 47673478.0, 'completions/mean_length': 4488.375, 'completions/min_length': 320.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4394.70849609375, 'completions/min_terminated_length': 320.0, 'completions/max_terminated_length': 14963.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2977364659309387, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018656963482499123, 'sampling/sampling_logp_difference/max': 1.0832443237304688, 'sampling/importance_sampling_ratio/min': 0.37111321091651917, 'sampling/importance_sampling_ratio/mean': 1.0000345706939697, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.07767131239234e-05, 'epoch': 0.06}
+
+  6%|▋         | 65/1024 [2:32:49<37:12:44, 139.69s/it][AINFO 11-30 22:04:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:04:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:04:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:04:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  6%|▋         | 66/1024 [2:35:20<38:04:45, 143.10s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0022162001114338636, 'learning_rate': 1e-05, 'num_tokens': 48449741.0, 'completions/mean_length': 5911.3046875, 'completions/min_length': 784.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5828.84228515625, 'completions/min_terminated_length': 784.0, 'completions/max_terminated_length': 14782.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.27062684297561646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01898515224456787, 'sampling/sampling_logp_difference/max': 4.473962306976318, 'sampling/importance_sampling_ratio/min': 0.011402048170566559, 'sampling/importance_sampling_ratio/mean': 1.00001859664917, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.181066492492391e-05, 'epoch': 0.06}
+
+  6%|▋         | 66/1024 [2:35:20<38:04:45, 143.10s/it][AINFO 11-30 22:06:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:06:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:06:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:06:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 67/1024 [2:37:47<38:22:19, 144.35s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0019079719204455614, 'learning_rate': 1e-05, 'num_tokens': 49250846.0, 'completions/mean_length': 6106.0078125, 'completions/min_length': 153.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6025.07861328125, 'completions/min_terminated_length': 153.0, 'completions/max_terminated_length': 16010.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.3435155153274536, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020224664360284805, 'sampling/sampling_logp_difference/max': 2.1085758209228516, 'sampling/importance_sampling_ratio/min': 0.12141075730323792, 'sampling/importance_sampling_ratio/mean': 1.0000361204147339, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.790763748336758e-05, 'epoch': 0.06}
+
+  7%|▋         | 67/1024 [2:37:47<38:22:19, 144.35s/it][AINFO 11-30 22:09:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:09:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:09:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:09:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 68/1024 [2:40:50<41:21:04, 155.72s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0014431207673624158, 'learning_rate': 1e-05, 'num_tokens': 50052530.0, 'completions/mean_length': 6116.21875, 'completions/min_length': 229.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5698.8291015625, 'completions/min_terminated_length': 229.0, 'completions/max_terminated_length': 16030.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016502579674124718, 'sampling/sampling_logp_difference/max': 2.3744759559631348, 'sampling/importance_sampling_ratio/min': 0.09306324273347855, 'sampling/importance_sampling_ratio/mean': 0.9999993443489075, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.370734609437932e-05, 'epoch': 0.06}
+
+  7%|▋         | 68/1024 [2:40:50<41:21:04, 155.72s/it][AINFO 11-30 22:12:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:12:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:12:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:12:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 69/1024 [2:43:11<40:10:00, 151.41s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0023658890277147293, 'learning_rate': 1e-05, 'num_tokens': 50783648.0, 'completions/mean_length': 5547.671875, 'completions/min_length': 115.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5462.3466796875, 'completions/min_terminated_length': 115.0, 'completions/max_terminated_length': 14191.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.3135228157043457, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01962287351489067, 'sampling/sampling_logp_difference/max': 1.8994135856628418, 'sampling/importance_sampling_ratio/min': 0.17115391790866852, 'sampling/importance_sampling_ratio/mean': 0.9999957084655762, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3898592309687956e-05, 'epoch': 0.06}
+
+  7%|▋         | 69/1024 [2:43:11<40:10:00, 151.41s/it][AINFO 11-30 22:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:14:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:14:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 70/1024 [2:45:20<38:19:39, 144.63s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.002180900424718857, 'learning_rate': 1e-05, 'num_tokens': 51454670.0, 'completions/mean_length': 5101.609375, 'completions/min_length': 391.0, 'completions/max_length': 15322.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5101.609375, 'completions/min_terminated_length': 391.0, 'completions/max_terminated_length': 15322.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.32035762071609497, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01861066371202469, 'sampling/sampling_logp_difference/max': 1.7193742990493774, 'sampling/importance_sampling_ratio/min': 0.17917822301387787, 'sampling/importance_sampling_ratio/mean': 0.99993896484375, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.846130341069511e-05, 'epoch': 0.06}
+
+  7%|▋         | 70/1024 [2:45:20<38:19:39, 144.63s/it][AINFO 11-30 22:16:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:16:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:16:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:16:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 71/1024 [2:48:08<40:12:04, 151.86s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0007782382308505476, 'learning_rate': 1e-05, 'num_tokens': 52347345.0, 'completions/mean_length': 6836.6484375, 'completions/min_length': 643.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6685.103515625, 'completions/min_terminated_length': 643.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017684806138277054, 'sampling/sampling_logp_difference/max': 9.202239036560059, 'sampling/importance_sampling_ratio/min': 0.00010081342043122277, 'sampling/importance_sampling_ratio/mean': 1.0000131130218506, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.4458003679228568e-05, 'epoch': 0.07}
+
+  7%|▋         | 71/1024 [2:48:08<40:12:04, 151.86s/it][AINFO 11-30 22:19:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:19:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:19:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:19:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 72/1024 [2:50:37<39:52:31, 150.79s/it][A
+                                                       [A{'loss': -0.0001, 'grad_norm': 0.0022644686978310347, 'learning_rate': 1e-05, 'num_tokens': 52995478.0, 'completions/mean_length': 4928.7265625, 'completions/min_length': 9.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4838.52734375, 'completions/min_terminated_length': 9.0, 'completions/max_terminated_length': 14271.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01846414990723133, 'sampling/sampling_logp_difference/max': 2.8144047260284424, 'sampling/importance_sampling_ratio/min': 0.0599403902888298, 'sampling/importance_sampling_ratio/mean': 0.9999539256095886, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.91908019487164e-05, 'epoch': 0.07}
+
+  7%|▋         | 72/1024 [2:50:37<39:52:31, 150.79s/it][AINFO 11-30 22:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:21:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:21:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 73/1024 [2:52:49<38:20:07, 145.12s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0022036891896277666, 'learning_rate': 1e-05, 'num_tokens': 53568694.0, 'completions/mean_length': 4337.5, 'completions/min_length': 397.0, 'completions/max_length': 15862.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4337.5, 'completions/min_terminated_length': 397.0, 'completions/max_terminated_length': 15862.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.35878273844718933, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017469648271799088, 'sampling/sampling_logp_difference/max': 1.562251091003418, 'sampling/importance_sampling_ratio/min': 0.20966355502605438, 'sampling/importance_sampling_ratio/mean': 0.9999600648880005, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.215770076778426e-05, 'epoch': 0.07}
+
+  7%|▋         | 73/1024 [2:52:49<38:20:07, 145.12s/it][AINFO 11-30 22:24:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:24:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:24:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:24:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 74/1024 [2:55:16<38:28:40, 145.81s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002367198932915926, 'learning_rate': 1e-05, 'num_tokens': 54291136.0, 'completions/mean_length': 5497.140625, 'completions/min_length': 578.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5054.5849609375, 'completions/min_terminated_length': 578.0, 'completions/max_terminated_length': 14342.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3077537715435028, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016853079199790955, 'sampling/sampling_logp_difference/max': 2.6654253005981445, 'sampling/importance_sampling_ratio/min': 0.06956975907087326, 'sampling/importance_sampling_ratio/mean': 1.000022530555725, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3844350077270065e-05, 'epoch': 0.07}
+
+  7%|▋         | 74/1024 [2:55:16<38:28:40, 145.81s/it][AINFO 11-30 22:26:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:26:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:26:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:26:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 75/1024 [2:57:38<38:09:39, 144.76s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0011786138638854027, 'learning_rate': 1e-05, 'num_tokens': 55042987.0, 'completions/mean_length': 5719.3984375, 'completions/min_length': 123.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5635.42529296875, 'completions/min_terminated_length': 123.0, 'completions/max_terminated_length': 15926.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 0.2265625, 'reward_std': 0.18542881309986115, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020988646894693375, 'sampling/sampling_logp_difference/max': 2.303022623062134, 'sampling/importance_sampling_ratio/min': 0.09995625913143158, 'sampling/importance_sampling_ratio/mean': 1.0000141859054565, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.747379491869651e-05, 'epoch': 0.07}
+
+  7%|▋         | 75/1024 [2:57:38<38:09:39, 144.76s/it][AINFO 11-30 22:28:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:28:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:28:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:28:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  7%|▋         | 76/1024 [3:00:12<38:49:34, 147.44s/it][A
+                                                       [A{'loss': 0.0001, 'grad_norm': 0.006951675284653902, 'learning_rate': 1e-05, 'num_tokens': 55866935.0, 'completions/mean_length': 6302.90625, 'completions/min_length': 576.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6223.52734375, 'completions/min_terminated_length': 576.0, 'completions/max_terminated_length': 15473.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.400318443775177, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018844101577997208, 'sampling/sampling_logp_difference/max': 7.4210524559021, 'sampling/importance_sampling_ratio/min': 0.0005985188763588667, 'sampling/importance_sampling_ratio/mean': 0.9999681711196899, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.193879155740433e-05, 'epoch': 0.07}
+
+  7%|▋         | 76/1024 [3:00:12<38:49:34, 147.44s/it][AINFO 11-30 22:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:31:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:31:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 77/1024 [3:02:31<38:07:12, 144.91s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0021795877255499363, 'learning_rate': 1e-05, 'num_tokens': 56560842.0, 'completions/mean_length': 5276.7109375, 'completions/min_length': 250.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5100.4052734375, 'completions/min_terminated_length': 250.0, 'completions/max_terminated_length': 14063.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.24381661415100098, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020101184025406837, 'sampling/sampling_logp_difference/max': 6.540733337402344, 'sampling/importance_sampling_ratio/min': 0.0014434296172112226, 'sampling/importance_sampling_ratio/mean': 0.9999696016311646, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.650080566330871e-05, 'epoch': 0.07}
+
+  8%|▊         | 77/1024 [3:02:31<38:07:12, 144.91s/it][AINFO 11-30 22:33:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:33:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:33:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:33:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 78/1024 [3:04:59<38:17:50, 145.74s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0024005803279578686, 'learning_rate': 1e-05, 'num_tokens': 57304402.0, 'completions/mean_length': 5665.0, 'completions/min_length': 431.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5407.744140625, 'completions/min_terminated_length': 431.0, 'completions/max_terminated_length': 16231.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2635546922683716, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018604082986712456, 'sampling/sampling_logp_difference/max': 14.277615547180176, 'sampling/importance_sampling_ratio/min': 6.299562187450647e-07, 'sampling/importance_sampling_ratio/mean': 0.9999913573265076, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.172804656514927e-05, 'epoch': 0.07}
+
+  8%|▊         | 78/1024 [3:04:59<38:17:50, 145.74s/it][AINFO 11-30 22:36:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:36:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:36:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:36:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 79/1024 [3:07:34<38:58:15, 148.46s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.002356703858822584, 'learning_rate': 1e-05, 'num_tokens': 58115921.0, 'completions/mean_length': 6189.4296875, 'completions/min_length': 590.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6109.1572265625, 'completions/min_terminated_length': 590.0, 'completions/max_terminated_length': 15832.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3316730856895447, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019039709120988846, 'sampling/sampling_logp_difference/max': 1.9228436946868896, 'sampling/importance_sampling_ratio/min': 0.17184996604919434, 'sampling/importance_sampling_ratio/mean': 1.0000288486480713, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.8025341754964757e-05, 'epoch': 0.07}
+
+  8%|▊         | 79/1024 [3:07:34<38:58:15, 148.46s/it][AINFO 11-30 22:38:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:38:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:38:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:38:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 80/1024 [3:09:52<38:06:22, 145.32s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.004188979510217905, 'learning_rate': 1e-05, 'num_tokens': 58815147.0, 'completions/mean_length': 5330.078125, 'completions/min_length': 324.0, 'completions/max_length': 16288.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5330.078125, 'completions/min_terminated_length': 324.0, 'completions/max_terminated_length': 16288.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3582410514354706, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019644031301140785, 'sampling/sampling_logp_difference/max': 9.566600799560547, 'sampling/importance_sampling_ratio/min': 7.002902566455305e-05, 'sampling/importance_sampling_ratio/mean': 1.000070333480835, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1143317275782465e-05, 'epoch': 0.07}
+
+  8%|▊         | 80/1024 [3:09:52<38:06:22, 145.32s/it][AINFO 11-30 22:41:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:41:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:41:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:41:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 81/1024 [3:12:19<38:13:32, 145.93s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002031494863331318, 'learning_rate': 1e-05, 'num_tokens': 59546886.0, 'completions/mean_length': 5563.9609375, 'completions/min_length': 301.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5478.763671875, 'completions/min_terminated_length': 301.0, 'completions/max_terminated_length': 14266.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.2188364714384079, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018099917098879814, 'sampling/sampling_logp_difference/max': 1.8323705196380615, 'sampling/importance_sampling_ratio/min': 0.16003374755382538, 'sampling/importance_sampling_ratio/mean': 1.0000100135803223, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.325555892137345e-05, 'epoch': 0.07}
+
+  8%|▊         | 81/1024 [3:12:19<38:13:32, 145.93s/it][AINFO 11-30 22:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:43:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:43:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 82/1024 [3:13:56<34:19:50, 131.20s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002914588898420334, 'learning_rate': 1e-05, 'num_tokens': 60078495.0, 'completions/mean_length': 3935.6953125, 'completions/min_length': 60.0, 'completions/max_length': 11977.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3935.6953125, 'completions/min_terminated_length': 60.0, 'completions/max_terminated_length': 11977.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.24040168523788452, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01803525537252426, 'sampling/sampling_logp_difference/max': 2.698113203048706, 'sampling/importance_sampling_ratio/min': 0.0673324316740036, 'sampling/importance_sampling_ratio/mean': 1.000085473060608, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.103921583009651e-05, 'epoch': 0.08}
+
+  8%|▊         | 82/1024 [3:13:56<34:19:50, 131.20s/it][AINFO 11-30 22:45:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:45:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:45:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:45:12 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 83/1024 [3:15:50<32:57:43, 126.10s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0009446697076782584, 'learning_rate': 1e-05, 'num_tokens': 60728490.0, 'completions/mean_length': 4930.0859375, 'completions/min_length': 105.0, 'completions/max_length': 13500.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4930.0859375, 'completions/min_terminated_length': 105.0, 'completions/max_terminated_length': 13500.0, 'rewards/accuracy_reward/mean': 0.21875, 'rewards/accuracy_reward/std': 0.41502299904823303, 'reward': 0.21875, 'reward_std': 0.2811809182167053, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018142428249120712, 'sampling/sampling_logp_difference/max': 6.101896286010742, 'sampling/importance_sampling_ratio/min': 0.002238618675619364, 'sampling/importance_sampling_ratio/mean': 1.000023603439331, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.686074635174009e-05, 'epoch': 0.08}
+
+  8%|▊         | 83/1024 [3:15:50<32:57:43, 126.10s/it][AINFO 11-30 22:47:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:47:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:47:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:47:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 84/1024 [3:18:11<34:04:25, 130.49s/it][A
+                                                       [A{'loss': 0.0001, 'grad_norm': 0.0052177440375089645, 'learning_rate': 1e-05, 'num_tokens': 61454759.0, 'completions/mean_length': 5476.0390625, 'completions/min_length': 369.0, 'completions/max_length': 13983.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5476.0390625, 'completions/min_terminated_length': 369.0, 'completions/max_terminated_length': 13983.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017880255356431007, 'sampling/sampling_logp_difference/max': 2.1576435565948486, 'sampling/importance_sampling_ratio/min': 0.3090202808380127, 'sampling/importance_sampling_ratio/mean': 0.9999613761901855, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.544730416000675e-05, 'epoch': 0.08}
+
+  8%|▊         | 84/1024 [3:18:11<34:04:25, 130.49s/it][AINFO 11-30 22:49:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:49:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:49:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:49:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 85/1024 [3:20:22<34:07:31, 130.83s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002467979211360216, 'learning_rate': 1e-05, 'num_tokens': 62129486.0, 'completions/mean_length': 5121.8671875, 'completions/min_length': 206.0, 'completions/max_length': 14047.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5121.8671875, 'completions/min_terminated_length': 206.0, 'completions/max_terminated_length': 14047.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01872030273079872, 'sampling/sampling_logp_difference/max': 7.56773567199707, 'sampling/importance_sampling_ratio/min': 0.0005168615025468171, 'sampling/importance_sampling_ratio/mean': 0.9999390244483948, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.063482399236818e-05, 'epoch': 0.08}
+
+  8%|▊         | 85/1024 [3:20:22<34:07:31, 130.83s/it][AINFO 11-30 22:51:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:51:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:51:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:51:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 86/1024 [3:23:04<36:29:10, 140.03s/it][A
+                                                       [A{'loss': 0.0001, 'grad_norm': 0.00976498518139124, 'learning_rate': 1e-05, 'num_tokens': 62993964.0, 'completions/mean_length': 6593.359375, 'completions/min_length': 260.0, 'completions/max_length': 16273.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6593.359375, 'completions/min_terminated_length': 260.0, 'completions/max_terminated_length': 16273.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019950274378061295, 'sampling/sampling_logp_difference/max': 4.664278030395508, 'sampling/importance_sampling_ratio/min': 0.009426050819456577, 'sampling/importance_sampling_ratio/mean': 0.9998830556869507, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.715064462994633e-05, 'epoch': 0.08}
+
+  8%|▊         | 86/1024 [3:23:04<36:29:10, 140.03s/it][AINFO 11-30 22:54:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:54:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:54:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:54:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  8%|▊         | 87/1024 [3:25:21<36:11:40, 139.06s/it][A
+                                                       [A{'loss': 0.0001, 'grad_norm': 0.0018940618028864264, 'learning_rate': 1e-05, 'num_tokens': 63675606.0, 'completions/mean_length': 5177.453125, 'completions/min_length': 93.0, 'completions/max_length': 14964.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5177.453125, 'completions/min_terminated_length': 93.0, 'completions/max_terminated_length': 14964.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02066379226744175, 'sampling/sampling_logp_difference/max': 1.6505475044250488, 'sampling/importance_sampling_ratio/min': 0.19194479286670685, 'sampling/importance_sampling_ratio/mean': 0.9999979138374329, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.226940111744625e-05, 'epoch': 0.08}
+
+  8%|▊         | 87/1024 [3:25:21<36:11:40, 139.06s/it][AINFO 11-30 22:56:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:56:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:56:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:56:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▊         | 88/1024 [3:27:53<37:10:38, 142.99s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0018224245868623257, 'learning_rate': 1e-05, 'num_tokens': 64527281.0, 'completions/mean_length': 6500.5234375, 'completions/min_length': 623.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6422.70068359375, 'completions/min_terminated_length': 623.0, 'completions/max_terminated_length': 15723.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.322716623544693, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01952294446527958, 'sampling/sampling_logp_difference/max': 1.6685032844543457, 'sampling/importance_sampling_ratio/min': 0.27177199721336365, 'sampling/importance_sampling_ratio/mean': 1.0000207424163818, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.741156494172174e-05, 'epoch': 0.08}
+
+  9%|▊         | 88/1024 [3:27:53<37:10:38, 142.99s/it][AINFO 11-30 22:59:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:59:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:59:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 22:59:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▊         | 89/1024 [3:29:40<34:19:16, 132.15s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0018940462032333016, 'learning_rate': 1e-05, 'num_tokens': 65219001.0, 'completions/mean_length': 5251.75, 'completions/min_length': 530.0, 'completions/max_length': 11434.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5251.75, 'completions/min_terminated_length': 530.0, 'completions/max_terminated_length': 11434.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01754143089056015, 'sampling/sampling_logp_difference/max': 3.0306174755096436, 'sampling/importance_sampling_ratio/min': 0.04828581213951111, 'sampling/importance_sampling_ratio/mean': 1.0000126361846924, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.615755886381521e-05, 'epoch': 0.08}
+
+  9%|▊         | 89/1024 [3:29:40<34:19:16, 132.15s/it][AINFO 11-30 23:00:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:00:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:00:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:00:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 90/1024 [3:31:44<33:41:54, 129.89s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0030503366142511368, 'learning_rate': 1e-05, 'num_tokens': 65854710.0, 'completions/mean_length': 4803.4765625, 'completions/min_length': 377.0, 'completions/max_length': 15204.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4803.4765625, 'completions/min_terminated_length': 377.0, 'completions/max_terminated_length': 15204.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.30168038606643677, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01917651668190956, 'sampling/sampling_logp_difference/max': 1.7813243865966797, 'sampling/importance_sampling_ratio/min': 0.16841496527194977, 'sampling/importance_sampling_ratio/mean': 1.0000015497207642, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0087076879681263e-05, 'epoch': 0.08}
+
+  9%|▉         | 90/1024 [3:31:44<33:41:54, 129.89s/it][AINFO 11-30 23:03:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:03:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:03:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:03:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 91/1024 [3:34:18<35:32:17, 137.12s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0015823139110580087, 'learning_rate': 1e-05, 'num_tokens': 66652483.0, 'completions/mean_length': 6030.8515625, 'completions/min_length': 140.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5521.68017578125, 'completions/min_terminated_length': 140.0, 'completions/max_terminated_length': 14537.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.2790592312812805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019791461527347565, 'sampling/sampling_logp_difference/max': 11.730330467224121, 'sampling/importance_sampling_ratio/min': 8.04604042059509e-06, 'sampling/importance_sampling_ratio/mean': 0.9999071955680847, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.327589701664692e-05, 'epoch': 0.08}
+
+  9%|▉         | 91/1024 [3:34:18<35:32:17, 137.12s/it][AINFO 11-30 23:05:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:05:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:05:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:05:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 92/1024 [3:36:57<37:11:15, 143.64s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.002732198918238282, 'learning_rate': 1e-05, 'num_tokens': 67424595.0, 'completions/mean_length': 5874.0, 'completions/min_length': 91.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5621.76025390625, 'completions/min_terminated_length': 91.0, 'completions/max_terminated_length': 16157.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.3248383104801178, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019318202510476112, 'sampling/sampling_logp_difference/max': 3.242915630340576, 'sampling/importance_sampling_ratio/min': 0.03904987499117851, 'sampling/importance_sampling_ratio/mean': 1.0000081062316895, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.902479981363285e-05, 'epoch': 0.08}
+
+  9%|▉         | 92/1024 [3:36:57<37:11:15, 143.64s/it][AINFO 11-30 23:08:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:08:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:08:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:08:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 93/1024 [3:39:04<35:51:29, 138.66s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0023166979663074017, 'learning_rate': 1e-05, 'num_tokens': 68174258.0, 'completions/mean_length': 5673.7421875, 'completions/min_length': 523.0, 'completions/max_length': 13427.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5673.7421875, 'completions/min_terminated_length': 523.0, 'completions/max_terminated_length': 13427.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3284856677055359, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019939376041293144, 'sampling/sampling_logp_difference/max': 2.2830207347869873, 'sampling/importance_sampling_ratio/min': 0.1019756942987442, 'sampling/importance_sampling_ratio/mean': 0.9999707937240601, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.803286131187633e-05, 'epoch': 0.09}
+
+  9%|▉         | 93/1024 [3:39:04<35:51:29, 138.66s/it][AINFO 11-30 23:10:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:10:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:10:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:10:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 94/1024 [3:41:30<36:21:09, 140.72s/it][A
+                                                       [A{'loss': -0.0001, 'grad_norm': 0.002618401311337948, 'learning_rate': 1e-05, 'num_tokens': 68847500.0, 'completions/mean_length': 5107.578125, 'completions/min_length': 560.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5018.78759765625, 'completions/min_terminated_length': 560.0, 'completions/max_terminated_length': 14706.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019186105579137802, 'sampling/sampling_logp_difference/max': 4.246817588806152, 'sampling/importance_sampling_ratio/min': 0.014309701509773731, 'sampling/importance_sampling_ratio/mean': 1.0000014305114746, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.812662468291819e-05, 'epoch': 0.09}
+
+  9%|▉         | 94/1024 [3:41:30<36:21:09, 140.72s/it][AINFO 11-30 23:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:12:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 95/1024 [3:43:52<36:25:36, 141.16s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.002423090860247612, 'learning_rate': 1e-05, 'num_tokens': 69596767.0, 'completions/mean_length': 5714.0859375, 'completions/min_length': 176.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5630.07080078125, 'completions/min_terminated_length': 176.0, 'completions/max_terminated_length': 16150.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.30744946002960205, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019837230443954468, 'sampling/sampling_logp_difference/max': 1.9064362049102783, 'sampling/importance_sampling_ratio/min': 0.14860905706882477, 'sampling/importance_sampling_ratio/mean': 1.0000383853912354, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.697475262422813e-05, 'epoch': 0.09}
+
+  9%|▉         | 95/1024 [3:43:52<36:25:36, 141.16s/it][AINFO 11-30 23:15:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:15:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:15:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:15:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 96/1024 [3:46:12<36:19:45, 140.93s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0029741444159299135, 'learning_rate': 1e-05, 'num_tokens': 70368962.0, 'completions/mean_length': 5892.5234375, 'completions/min_length': 321.0, 'completions/max_length': 16235.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5892.5234375, 'completions/min_terminated_length': 321.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3374421298503876, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01912284828722477, 'sampling/sampling_logp_difference/max': 1.5773849487304688, 'sampling/importance_sampling_ratio/min': 0.21426165103912354, 'sampling/importance_sampling_ratio/mean': 1.0000895261764526, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6557486232122756e-05, 'epoch': 0.09}
+
+  9%|▉         | 96/1024 [3:46:12<36:19:45, 140.93s/it][AINFO 11-30 23:17:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:17:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:17:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:17:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+  9%|▉         | 97/1024 [3:48:19<35:11:49, 136.69s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.003496806602925062, 'learning_rate': 1e-05, 'num_tokens': 70941332.0, 'completions/mean_length': 4327.640625, 'completions/min_length': 661.0, 'completions/max_length': 15076.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4327.640625, 'completions/min_terminated_length': 661.0, 'completions/max_terminated_length': 15076.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.4121658205986023, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.017024852335453033, 'sampling/sampling_logp_difference/max': 1.5312318801879883, 'sampling/importance_sampling_ratio/min': 0.287904292345047, 'sampling/importance_sampling_ratio/mean': 1.0000836849212646, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.151407498895423e-05, 'epoch': 0.09}
+
+  9%|▉         | 97/1024 [3:48:19<35:11:49, 136.69s/it][AINFO 11-30 23:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:19:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:19:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|▉         | 98/1024 [3:50:45<35:54:52, 139.63s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.00229308707639575, 'learning_rate': 1e-05, 'num_tokens': 71717328.0, 'completions/mean_length': 5904.90625, 'completions/min_length': 54.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5822.3935546875, 'completions/min_terminated_length': 54.0, 'completions/max_terminated_length': 15021.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3369230031967163, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01896141842007637, 'sampling/sampling_logp_difference/max': 2.597864866256714, 'sampling/importance_sampling_ratio/min': 0.07443232834339142, 'sampling/importance_sampling_ratio/mean': 1.0000367164611816, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.280609002511483e-05, 'epoch': 0.09}
+
+ 10%|▉         | 98/1024 [3:50:45<35:54:52, 139.63s/it][AINFO 11-30 23:22:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:22:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:22:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:22:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|▉         | 99/1024 [3:52:58<35:19:49, 137.50s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0024638224858790636, 'learning_rate': 1e-05, 'num_tokens': 72280867.0, 'completions/mean_length': 4262.9609375, 'completions/min_length': 230.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4167.51953125, 'completions/min_terminated_length': 230.0, 'completions/max_terminated_length': 15741.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.34245961904525757, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01848158985376358, 'sampling/sampling_logp_difference/max': 1.783703088760376, 'sampling/importance_sampling_ratio/min': 0.16801482439041138, 'sampling/importance_sampling_ratio/mean': 1.0000083446502686, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.993454799910978e-05, 'epoch': 0.09}
+
+ 10%|▉         | 99/1024 [3:52:58<35:19:49, 137.50s/it][AINFO 11-30 23:24:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:24:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:24:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:24:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|▉         | 100/1024 [3:54:56<33:47:34, 131.66s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002213232684880495, 'learning_rate': 1e-05, 'num_tokens': 72916082.0, 'completions/mean_length': 4811.8046875, 'completions/min_length': 404.0, 'completions/max_length': 14249.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4811.8046875, 'completions/min_terminated_length': 404.0, 'completions/max_terminated_length': 14249.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.40609243512153625, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01737941801548004, 'sampling/sampling_logp_difference/max': 3.955639123916626, 'sampling/importance_sampling_ratio/min': 0.019146427512168884, 'sampling/importance_sampling_ratio/mean': 0.9999799728393555, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5062820845487295e-05, 'epoch': 0.09}
+
+ 10%|▉         | 100/1024 [3:54:56<33:47:34, 131.66s/it][AINFO 11-30 23:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:26:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|▉         | 101/1024 [3:57:21<34:48:16, 135.75s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002592389704659581, 'learning_rate': 1e-05, 'num_tokens': 73593116.0, 'completions/mean_length': 5140.828125, 'completions/min_length': 274.0, 'completions/max_length': 15724.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5140.828125, 'completions/min_terminated_length': 274.0, 'completions/max_terminated_length': 15724.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2614428400993347, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017767824232578278, 'sampling/sampling_logp_difference/max': 2.5789036750793457, 'sampling/importance_sampling_ratio/min': 0.07585711777210236, 'sampling/importance_sampling_ratio/mean': 1.0000168085098267, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.02272522074054e-05, 'epoch': 0.09}
+
+ 10%|▉         | 101/1024 [3:57:21<34:48:16, 135.75s/it][AINFO 11-30 23:28:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:28:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:28:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:28:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|▉         | 102/1024 [3:59:38<34:51:52, 136.13s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0017981899436563253, 'learning_rate': 1e-05, 'num_tokens': 74340995.0, 'completions/mean_length': 5688.7421875, 'completions/min_length': 523.0, 'completions/max_length': 14583.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5688.7421875, 'completions/min_terminated_length': 523.0, 'completions/max_terminated_length': 14583.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.26303064823150635, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019012723118066788, 'sampling/sampling_logp_difference/max': 1.9495733976364136, 'sampling/importance_sampling_ratio/min': 0.14233477413654327, 'sampling/importance_sampling_ratio/mean': 0.9999580383300781, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.490121898470534e-05, 'epoch': 0.09}
+
+ 10%|▉         | 102/1024 [3:59:38<34:51:52, 136.13s/it][AINFO 11-30 23:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:30:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|█         | 103/1024 [4:01:52<34:40:15, 135.52s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.004704791121184826, 'learning_rate': 1e-05, 'num_tokens': 75001621.0, 'completions/mean_length': 5013.828125, 'completions/min_length': 588.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4924.29931640625, 'completions/min_terminated_length': 588.0, 'completions/max_terminated_length': 14673.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020504780113697052, 'sampling/sampling_logp_difference/max': 6.123542785644531, 'sampling/importance_sampling_ratio/min': 0.0021906811743974686, 'sampling/importance_sampling_ratio/mean': 1.0000085830688477, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.968414105453121e-05, 'epoch': 0.09}
+
+ 10%|█         | 103/1024 [4:01:52<34:40:15, 135.52s/it][AINFO 11-30 23:33:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:33:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:33:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:33:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|█         | 104/1024 [4:04:32<36:27:03, 142.63s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002013104734942317, 'learning_rate': 1e-05, 'num_tokens': 75922461.0, 'completions/mean_length': 7011.0625, 'completions/min_length': 731.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6786.1123046875, 'completions/min_terminated_length': 731.0, 'completions/max_terminated_length': 16360.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019036103039979935, 'sampling/sampling_logp_difference/max': 3.155829429626465, 'sampling/importance_sampling_ratio/min': 0.04260304942727089, 'sampling/importance_sampling_ratio/mean': 0.9999270439147949, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.700972224076395e-05, 'epoch': 0.1}
+
+ 10%|█         | 104/1024 [4:04:32<36:27:03, 142.63s/it][AINFO 11-30 23:35:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:35:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:35:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:35:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|█         | 105/1024 [4:07:00<36:49:44, 144.27s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019418720621615648, 'learning_rate': 1e-05, 'num_tokens': 76668271.0, 'completions/mean_length': 5670.765625, 'completions/min_length': 741.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5586.40966796875, 'completions/min_terminated_length': 741.0, 'completions/max_terminated_length': 16197.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3766237497329712, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020093616098165512, 'sampling/sampling_logp_difference/max': 3.185605049133301, 'sampling/importance_sampling_ratio/min': 0.041353218257427216, 'sampling/importance_sampling_ratio/mean': 1.0000053644180298, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5571251373767154e-05, 'epoch': 0.1}
+
+ 10%|█         | 105/1024 [4:07:00<36:49:44, 144.27s/it][AINFO 11-30 23:38:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:38:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:38:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:38:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|█         | 106/1024 [4:09:02<35:05:13, 137.60s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0040908739902079105, 'learning_rate': 1e-05, 'num_tokens': 77323536.0, 'completions/mean_length': 4968.6328125, 'completions/min_length': 712.0, 'completions/max_length': 13319.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4968.6328125, 'completions/min_terminated_length': 712.0, 'completions/max_terminated_length': 13319.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2659186124801636, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01808089017868042, 'sampling/sampling_logp_difference/max': 8.7059965133667, 'sampling/importance_sampling_ratio/min': 0.00016558986681047827, 'sampling/importance_sampling_ratio/mean': 1.0000154972076416, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4835569408642186e-05, 'epoch': 0.1}
+
+ 10%|█         | 106/1024 [4:09:02<35:05:13, 137.60s/it][AINFO 11-30 23:40:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:40:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:40:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:40:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 10%|█         | 107/1024 [4:11:20<35:07:26, 137.89s/it][A
+                                                        [A{'loss': -0.0001, 'grad_norm': 0.0015432675136253238, 'learning_rate': 1e-05, 'num_tokens': 78077173.0, 'completions/mean_length': 5751.7265625, 'completions/min_length': 146.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5496.55224609375, 'completions/min_terminated_length': 146.0, 'completions/max_terminated_length': 14858.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020379889756441116, 'sampling/sampling_logp_difference/max': 4.159396171569824, 'sampling/importance_sampling_ratio/min': 0.01561698503792286, 'sampling/importance_sampling_ratio/mean': 0.9999821186065674, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.317298064255738e-05, 'epoch': 0.1}
+
+ 10%|█         | 107/1024 [4:11:20<35:07:26, 137.89s/it][AINFO 11-30 23:42:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:42:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:42:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:42:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 108/1024 [4:13:24<33:58:17, 133.51s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0026939096860587597, 'learning_rate': 1e-05, 'num_tokens': 78730508.0, 'completions/mean_length': 4954.4296875, 'completions/min_length': 588.0, 'completions/max_length': 14858.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4954.4296875, 'completions/min_terminated_length': 588.0, 'completions/max_terminated_length': 14858.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019379660487174988, 'sampling/sampling_logp_difference/max': 2.1853394508361816, 'sampling/importance_sampling_ratio/min': 0.11243956536054611, 'sampling/importance_sampling_ratio/mean': 1.0000407695770264, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1087313206844556e-05, 'epoch': 0.1}
+
+ 11%|█         | 108/1024 [4:13:24<33:58:17, 133.51s/it][AINFO 11-30 23:44:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:44:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:44:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:44:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 109/1024 [4:15:43<34:21:58, 135.21s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0021755006164312363, 'learning_rate': 1e-05, 'num_tokens': 79234432.0, 'completions/mean_length': 3793.40625, 'completions/min_length': 334.0, 'completions/max_length': 15147.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 3793.40625, 'completions/min_terminated_length': 334.0, 'completions/max_terminated_length': 15147.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016289927065372467, 'sampling/sampling_logp_difference/max': 2.470442771911621, 'sampling/importance_sampling_ratio/min': 0.08454741537570953, 'sampling/importance_sampling_ratio/mean': 0.9999761581420898, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.241210967848019e-05, 'epoch': 0.1}
+
+ 11%|█         | 109/1024 [4:15:43<34:21:58, 135.21s/it][AINFO 11-30 23:46:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:46:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:46:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:46:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 110/1024 [4:18:08<35:06:26, 138.28s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.005381665658205748, 'learning_rate': 1e-05, 'num_tokens': 79922298.0, 'completions/mean_length': 5205.328125, 'completions/min_length': 36.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5117.30712890625, 'completions/min_terminated_length': 36.0, 'completions/max_terminated_length': 15459.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.29036492109298706, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018831558525562286, 'sampling/sampling_logp_difference/max': 2.5375452041625977, 'sampling/importance_sampling_ratio/min': 0.07906024158000946, 'sampling/importance_sampling_ratio/mean': 1.000024437904358, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.9903061178847565e-05, 'epoch': 0.1}
+
+ 11%|█         | 110/1024 [4:18:08<35:06:26, 138.28s/it][AINFO 11-30 23:49:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:49:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:49:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:49:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 111/1024 [4:20:34<35:37:06, 140.45s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002991379238665104, 'learning_rate': 1e-05, 'num_tokens': 80663678.0, 'completions/mean_length': 5648.46875, 'completions/min_length': 124.0, 'completions/max_length': 15716.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5648.46875, 'completions/min_terminated_length': 124.0, 'completions/max_terminated_length': 15716.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.265913724899292, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020358851179480553, 'sampling/sampling_logp_difference/max': 1.4857721328735352, 'sampling/importance_sampling_ratio/min': 0.22632752358913422, 'sampling/importance_sampling_ratio/mean': 0.9999160766601562, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4829319702112116e-05, 'epoch': 0.1}
+
+ 11%|█         | 111/1024 [4:20:34<35:37:06, 140.45s/it][AINFO 11-30 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:51:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 112/1024 [4:23:10<36:48:36, 145.30s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002751540159806609, 'learning_rate': 1e-05, 'num_tokens': 81532875.0, 'completions/mean_length': 6625.1015625, 'completions/min_length': 731.0, 'completions/max_length': 16123.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6625.1015625, 'completions/min_terminated_length': 731.0, 'completions/max_terminated_length': 16123.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.22832970321178436, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018914129585027695, 'sampling/sampling_logp_difference/max': 4.251466274261475, 'sampling/importance_sampling_ratio/min': 0.014243333600461483, 'sampling/importance_sampling_ratio/mean': 0.9999794960021973, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.0715250495159125e-05, 'epoch': 0.1}
+
+ 11%|█         | 112/1024 [4:23:10<36:48:36, 145.30s/it][AINFO 11-30 23:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:54:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:54:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 113/1024 [4:25:44<37:22:49, 147.72s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0024358206428587437, 'learning_rate': 1e-05, 'num_tokens': 82341844.0, 'completions/mean_length': 6139.8203125, 'completions/min_length': 123.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5893.96044921875, 'completions/min_terminated_length': 123.0, 'completions/max_terminated_length': 16265.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.30221226811408997, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01934763416647911, 'sampling/sampling_logp_difference/max': 2.19158935546875, 'sampling/importance_sampling_ratio/min': 0.11173901706933975, 'sampling/importance_sampling_ratio/mean': 0.9999080896377563, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.596127882905421e-05, 'epoch': 0.1}
+
+ 11%|█         | 113/1024 [4:25:44<37:22:49, 147.72s/it][AINFO 11-30 23:57:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:57:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:57:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:57:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 114/1024 [4:28:22<38:09:51, 150.98s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.005570736713707447, 'learning_rate': 1e-05, 'num_tokens': 83054548.0, 'completions/mean_length': 5419.5625, 'completions/min_length': 633.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5245.52392578125, 'completions/min_terminated_length': 633.0, 'completions/max_terminated_length': 15867.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2637920379638672, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016997840255498886, 'sampling/sampling_logp_difference/max': 1.9162261486053467, 'sampling/importance_sampling_ratio/min': 0.14716127514839172, 'sampling/importance_sampling_ratio/mean': 1.0000163316726685, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1299435995133535e-05, 'epoch': 0.1}
+
+ 11%|█         | 114/1024 [4:28:22<38:09:51, 150.98s/it][AINFO 11-30 23:59:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:59:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:59:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 11-30 23:59:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█         | 115/1024 [4:30:52<38:01:15, 150.58s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.004660828970372677, 'learning_rate': 1e-05, 'num_tokens': 83717984.0, 'completions/mean_length': 5047.28125, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4867.33349609375, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15521.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3527044355869293, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017476122826337814, 'sampling/sampling_logp_difference/max': 1.705055832862854, 'sampling/importance_sampling_ratio/min': 0.181762233376503, 'sampling/importance_sampling_ratio/mean': 0.9999426007270813, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.6087629534195e-05, 'epoch': 0.11}
+
+ 11%|█         | 115/1024 [4:30:52<38:01:15, 150.58s/it][AINFO 12-01 00:02:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:02:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:02:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:02:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█▏        | 116/1024 [4:33:12<37:10:54, 147.42s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017224401235580444, 'learning_rate': 1e-05, 'num_tokens': 84473630.0, 'completions/mean_length': 5741.609375, 'completions/min_length': 522.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5572.68310546875, 'completions/min_terminated_length': 522.0, 'completions/max_terminated_length': 15662.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.1922685205936432, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02057701162993908, 'sampling/sampling_logp_difference/max': 2.608762741088867, 'sampling/importance_sampling_ratio/min': 0.0736255794763565, 'sampling/importance_sampling_ratio/mean': 0.999947190284729, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.40556723838381e-05, 'epoch': 0.11}
+
+ 11%|█▏        | 116/1024 [4:33:12<37:10:54, 147.42s/it][AINFO 12-01 00:04:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:04:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:04:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:04:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 11%|█▏        | 117/1024 [4:36:17<39:58:42, 158.68s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0024027025792747736, 'learning_rate': 1e-05, 'num_tokens': 85412948.0, 'completions/mean_length': 7186.296875, 'completions/min_length': 754.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6812.40625, 'completions/min_terminated_length': 754.0, 'completions/max_terminated_length': 15929.0, 'rewards/accuracy_reward/mean': 0.2109375, 'rewards/accuracy_reward/std': 0.4095771610736847, 'reward': 0.2109375, 'reward_std': 0.2330428510904312, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02117101103067398, 'sampling/sampling_logp_difference/max': 12.068208694458008, 'sampling/importance_sampling_ratio/min': 5.739096650358988e-06, 'sampling/importance_sampling_ratio/mean': 0.9999825954437256, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.92931712869904e-05, 'epoch': 0.11}
+
+ 11%|█▏        | 117/1024 [4:36:17<39:58:42, 158.68s/it][AINFO 12-01 00:07:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:07:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:07:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:07:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 118/1024 [4:38:31<38:03:37, 151.23s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018196414457634091, 'learning_rate': 1e-05, 'num_tokens': 86048425.0, 'completions/mean_length': 4788.0390625, 'completions/min_length': 471.0, 'completions/max_length': 16188.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4788.0390625, 'completions/min_terminated_length': 471.0, 'completions/max_terminated_length': 16188.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019435198977589607, 'sampling/sampling_logp_difference/max': 3.7541348934173584, 'sampling/importance_sampling_ratio/min': 0.023420704528689384, 'sampling/importance_sampling_ratio/mean': 1.000002145767212, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.7468777716421755e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 118/1024 [4:38:31<38:03:37, 151.23s/it][AINFO 12-01 00:09:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:09:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:09:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:09:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 119/1024 [4:41:08<38:25:43, 152.87s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0029167805332690477, 'learning_rate': 1e-05, 'num_tokens': 86912416.0, 'completions/mean_length': 6600.6171875, 'completions/min_length': 501.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6445.32568359375, 'completions/min_terminated_length': 501.0, 'completions/max_terminated_length': 16119.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3319055438041687, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018772078678011894, 'sampling/sampling_logp_difference/max': 2.960973024368286, 'sampling/importance_sampling_ratio/min': 0.05176852270960808, 'sampling/importance_sampling_ratio/mean': 1.0000026226043701, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.4978657342653605e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 119/1024 [4:41:08<38:25:43, 152.87s/it][AINFO 12-01 00:12:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:12:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:12:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:12:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 120/1024 [4:43:20<36:48:55, 146.61s/it][A
+                                                        [A{'loss': -0.0001, 'grad_norm': 0.0022553224116563797, 'learning_rate': 1e-05, 'num_tokens': 87612196.0, 'completions/mean_length': 5287.53125, 'completions/min_length': 4.0, 'completions/max_length': 14197.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5287.53125, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 14197.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01811406761407852, 'sampling/sampling_logp_difference/max': 1.6773791313171387, 'sampling/importance_sampling_ratio/min': 0.21039076149463654, 'sampling/importance_sampling_ratio/mean': 0.9999818801879883, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.170931779550301e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 120/1024 [4:43:20<36:48:55, 146.61s/it][AINFO 12-01 00:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:14:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 121/1024 [4:46:01<37:55:15, 151.18s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0016515000024810433, 'learning_rate': 1e-05, 'num_tokens': 88509254.0, 'completions/mean_length': 6852.578125, 'completions/min_length': 229.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6383.8193359375, 'completions/min_terminated_length': 229.0, 'completions/max_terminated_length': 15597.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2761634290218353, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019428953528404236, 'sampling/sampling_logp_difference/max': 1.5338797569274902, 'sampling/importance_sampling_ratio/min': 0.21569719910621643, 'sampling/importance_sampling_ratio/mean': 0.9999850988388062, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.479192623170093e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 121/1024 [4:46:01<37:55:15, 151.18s/it][AINFO 12-01 00:17:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:17:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:17:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:17:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 122/1024 [4:48:42<38:35:03, 154.00s/it][A
+                                                        [A{'loss': -0.0001, 'grad_norm': 0.002076901262626052, 'learning_rate': 1e-05, 'num_tokens': 89304684.0, 'completions/mean_length': 6054.359375, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5806.4482421875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15247.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.24670752882957458, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02084452286362648, 'sampling/sampling_logp_difference/max': 14.992361068725586, 'sampling/importance_sampling_ratio/min': 3.082480475313787e-07, 'sampling/importance_sampling_ratio/mean': 0.9999532699584961, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3633140876409016e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 122/1024 [4:48:42<38:35:03, 154.00s/it][AINFO 12-01 00:19:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:19:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:19:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:19:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 123/1024 [4:51:10<38:06:31, 152.27s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0015113067347556353, 'learning_rate': 1e-05, 'num_tokens': 90168810.0, 'completions/mean_length': 6600.296875, 'completions/min_length': 681.0, 'completions/max_length': 15945.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6600.296875, 'completions/min_terminated_length': 681.0, 'completions/max_terminated_length': 15945.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01965155079960823, 'sampling/sampling_logp_difference/max': 3.7143967151641846, 'sampling/importance_sampling_ratio/min': 0.024370139464735985, 'sampling/importance_sampling_ratio/mean': 1.0000028610229492, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.038815558895294e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 123/1024 [4:51:10<38:06:31, 152.27s/it][AINFO 12-01 00:22:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:22:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:22:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:22:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 124/1024 [4:53:12<35:48:55, 143.26s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0021532925311475992, 'learning_rate': 1e-05, 'num_tokens': 90837803.0, 'completions/mean_length': 5068.3203125, 'completions/min_length': 785.0, 'completions/max_length': 12764.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5068.3203125, 'completions/min_terminated_length': 785.0, 'completions/max_terminated_length': 12764.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.18885356187820435, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01898997090756893, 'sampling/sampling_logp_difference/max': 1.7759761810302734, 'sampling/importance_sampling_ratio/min': 0.1693180799484253, 'sampling/importance_sampling_ratio/mean': 0.9999933242797852, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1740032341076585e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 124/1024 [4:53:12<35:48:55, 143.26s/it][AINFO 12-01 00:24:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:24:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:24:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:24:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 125/1024 [4:55:47<36:35:41, 146.54s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0006613060249947011, 'learning_rate': 1e-05, 'num_tokens': 91725499.0, 'completions/mean_length': 6728.1875, 'completions/min_length': 365.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6574.9208984375, 'completions/min_terminated_length': 365.0, 'completions/max_terminated_length': 15194.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.17859894037246704, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01934042200446129, 'sampling/sampling_logp_difference/max': 2.5190277099609375, 'sampling/importance_sampling_ratio/min': 0.08053787052631378, 'sampling/importance_sampling_ratio/mean': 0.9999381899833679, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7756068422822864e-05, 'epoch': 0.11}
+
+ 12%|█▏        | 125/1024 [4:55:47<36:35:41, 146.54s/it][AINFO 12-01 00:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:27:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:27:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 126/1024 [4:58:18<36:55:24, 148.02s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019437777809798717, 'learning_rate': 1e-05, 'num_tokens': 92549192.0, 'completions/mean_length': 6296.9140625, 'completions/min_length': 771.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6136.8017578125, 'completions/min_terminated_length': 771.0, 'completions/max_terminated_length': 15525.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019422942772507668, 'sampling/sampling_logp_difference/max': 9.038908004760742, 'sampling/importance_sampling_ratio/min': 0.00011870038724737242, 'sampling/importance_sampling_ratio/mean': 1.0000581741333008, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0139222758407413e-05, 'epoch': 0.12}
+
+ 12%|█▏        | 126/1024 [4:58:18<36:55:24, 148.02s/it][AINFO 12-01 00:29:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:29:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:29:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:29:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▏        | 127/1024 [5:00:59<37:49:31, 151.81s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0010669425828382373, 'learning_rate': 1e-05, 'num_tokens': 93478481.0, 'completions/mean_length': 7106.3203125, 'completions/min_length': 526.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6883.65625, 'completions/min_terminated_length': 526.0, 'completions/max_terminated_length': 15810.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.28749164938926697, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01950283721089363, 'sampling/sampling_logp_difference/max': 3.2580370903015137, 'sampling/importance_sampling_ratio/min': 0.03846382349729538, 'sampling/importance_sampling_ratio/mean': 1.000058889389038, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.60209468605899e-05, 'epoch': 0.12}
+
+ 12%|█▏        | 127/1024 [5:00:59<37:49:31, 151.81s/it][AINFO 12-01 00:32:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:32:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:32:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:32:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 12%|█▎        | 128/1024 [5:03:24<37:17:39, 149.84s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0018417143728584051, 'learning_rate': 1e-05, 'num_tokens': 94252193.0, 'completions/mean_length': 5882.9375, 'completions/min_length': 528.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5800.251953125, 'completions/min_terminated_length': 528.0, 'completions/max_terminated_length': 15676.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.379814088344574, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019602306187152863, 'sampling/sampling_logp_difference/max': 2.298842191696167, 'sampling/importance_sampling_ratio/min': 0.1003749892115593, 'sampling/importance_sampling_ratio/mean': 1.0000650882720947, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.851858807342069e-05, 'epoch': 0.12}
+
+ 12%|█▎        | 128/1024 [5:03:24<37:17:39, 149.84s/it][AINFO 12-01 00:34:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:34:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:34:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:34:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 129/1024 [5:06:08<38:19:26, 154.15s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0009487879578955472, 'learning_rate': 1e-05, 'num_tokens': 95167308.0, 'completions/mean_length': 6975.5859375, 'completions/min_length': 120.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6901.50390625, 'completions/min_terminated_length': 120.0, 'completions/max_terminated_length': 16076.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.251193106174469, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021269258111715317, 'sampling/sampling_logp_difference/max': 5.124932289123535, 'sampling/importance_sampling_ratio/min': 0.005946619901806116, 'sampling/importance_sampling_ratio/mean': 0.9999721646308899, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.860963403847563e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 129/1024 [5:06:08<38:19:26, 154.15s/it][AINFO 12-01 00:37:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:37:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:37:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:37:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 130/1024 [5:09:04<39:55:05, 160.74s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001998023595660925, 'learning_rate': 1e-05, 'num_tokens': 96103031.0, 'completions/mean_length': 7140.0859375, 'completions/min_length': 860.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6918.232421875, 'completions/min_terminated_length': 860.0, 'completions/max_terminated_length': 16300.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2858891487121582, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018622038885951042, 'sampling/sampling_logp_difference/max': 2.5895159244537354, 'sampling/importance_sampling_ratio/min': 0.07505636662244797, 'sampling/importance_sampling_ratio/mean': 0.999992847442627, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.543257651017484e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 130/1024 [5:09:04<39:55:05, 160.74s/it][AINFO 12-01 00:40:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:40:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:40:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:40:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 131/1024 [5:11:13<37:27:16, 150.99s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.004389134701341391, 'learning_rate': 1e-05, 'num_tokens': 96793730.0, 'completions/mean_length': 5262.5859375, 'completions/min_length': 348.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5175.015625, 'completions/min_terminated_length': 348.0, 'completions/max_terminated_length': 11534.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020193614065647125, 'sampling/sampling_logp_difference/max': 3.518979549407959, 'sampling/importance_sampling_ratio/min': 0.029629655182361603, 'sampling/importance_sampling_ratio/mean': 1.0000331401824951, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.7265558514955046e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 131/1024 [5:11:13<37:27:16, 150.99s/it][AINFO 12-01 00:42:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:42:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:42:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:42:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 132/1024 [5:13:48<37:42:19, 152.17s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.004154732916504145, 'learning_rate': 1e-05, 'num_tokens': 97561836.0, 'completions/mean_length': 5819.890625, 'completions/min_length': 99.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5566.35205078125, 'completions/min_terminated_length': 99.0, 'completions/max_terminated_length': 15530.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.4105731248855591, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017437271773815155, 'sampling/sampling_logp_difference/max': 8.98104190826416, 'sampling/importance_sampling_ratio/min': 0.0001257717376574874, 'sampling/importance_sampling_ratio/mean': 1.0000028610229492, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.632682066585403e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 132/1024 [5:13:48<37:42:19, 152.17s/it][AINFO 12-01 00:45:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:45:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:45:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:45:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 133/1024 [5:16:17<37:29:36, 151.49s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001528579625301063, 'learning_rate': 1e-05, 'num_tokens': 98421181.0, 'completions/mean_length': 6571.5703125, 'completions/min_length': 789.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6494.30712890625, 'completions/min_terminated_length': 789.0, 'completions/max_terminated_length': 16102.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3543020486831665, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020707275718450546, 'sampling/sampling_logp_difference/max': 11.246054649353027, 'sampling/importance_sampling_ratio/min': 1.3058716831437778e-05, 'sampling/importance_sampling_ratio/mean': 0.9998245239257812, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.631742948888132e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 133/1024 [5:16:17<37:29:36, 151.49s/it][AINFO 12-01 00:47:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:47:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:47:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:47:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 134/1024 [5:18:47<37:19:06, 150.95s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.003362236311659217, 'learning_rate': 1e-05, 'num_tokens': 99125709.0, 'completions/mean_length': 5361.625, 'completions/min_length': 859.0, 'completions/max_length': 15211.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5361.625, 'completions/min_terminated_length': 859.0, 'completions/max_terminated_length': 15211.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2914257347583771, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018883462995290756, 'sampling/sampling_logp_difference/max': 1.6576693058013916, 'sampling/importance_sampling_ratio/min': 0.20106850564479828, 'sampling/importance_sampling_ratio/mean': 1.000123143196106, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.8368983925684006e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 134/1024 [5:18:47<37:19:06, 150.95s/it][AINFO 12-01 00:50:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:50:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:50:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:50:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 135/1024 [5:21:15<37:01:14, 149.91s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.004756550304591656, 'learning_rate': 1e-05, 'num_tokens': 99842457.0, 'completions/mean_length': 5439.96875, 'completions/min_length': 613.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5177.31201171875, 'completions/min_terminated_length': 613.0, 'completions/max_terminated_length': 16096.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.43266528844833374, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.01653439551591873, 'sampling/sampling_logp_difference/max': 1.141636848449707, 'sampling/importance_sampling_ratio/min': 0.3192959427833557, 'sampling/importance_sampling_ratio/mean': 1.000024437904358, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.445614119205857e-05, 'epoch': 0.12}
+
+ 13%|█▎        | 135/1024 [5:21:15<37:01:14, 149.91s/it][AINFO 12-01 00:52:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:52:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:52:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:52:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 136/1024 [5:23:30<35:54:33, 145.58s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.003304203739389777, 'learning_rate': 1e-05, 'num_tokens': 100442771.0, 'completions/mean_length': 4549.515625, 'completions/min_length': 258.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4456.33056640625, 'completions/min_terminated_length': 258.0, 'completions/max_terminated_length': 13493.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.38087981939315796, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017065348103642464, 'sampling/sampling_logp_difference/max': 1.9317026138305664, 'sampling/importance_sampling_ratio/min': 0.14490127563476562, 'sampling/importance_sampling_ratio/mean': 1.0000040531158447, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.0341657899698475e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 136/1024 [5:23:30<35:54:33, 145.58s/it][AINFO 12-01 00:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:54:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:54:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 137/1024 [5:25:43<34:54:45, 141.70s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002885580761358142, 'learning_rate': 1e-05, 'num_tokens': 101093639.0, 'completions/mean_length': 4896.65625, 'completions/min_length': 617.0, 'completions/max_length': 14693.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4896.65625, 'completions/min_terminated_length': 617.0, 'completions/max_terminated_length': 14693.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.30197980999946594, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018230509012937546, 'sampling/sampling_logp_difference/max': 7.1592583656311035, 'sampling/importance_sampling_ratio/min': 0.0007776310667395592, 'sampling/importance_sampling_ratio/mean': 1.000004529953003, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.6989751859218813e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 137/1024 [5:25:43<34:54:45, 141.70s/it][AINFO 12-01 00:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:56:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 00:56:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 13%|█▎        | 138/1024 [5:28:51<38:19:19, 155.71s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0018814237555488944, 'learning_rate': 1e-05, 'num_tokens': 101994495.0, 'completions/mean_length': 6842.1875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6454.30859375, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15841.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.20411096513271332, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019331861287355423, 'sampling/sampling_logp_difference/max': 2.896165609359741, 'sampling/importance_sampling_ratio/min': 0.055234603583812714, 'sampling/importance_sampling_ratio/mean': 0.9999523162841797, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.178404310550832e-05, 'epoch': 0.13}
+
+ 13%|█▎        | 138/1024 [5:28:51<38:19:19, 155.71s/it][AINFO 12-01 01:00:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:00:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:00:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:00:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▎        | 139/1024 [5:31:04<36:35:30, 148.85s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0022405902855098248, 'learning_rate': 1e-05, 'num_tokens': 102601114.0, 'completions/mean_length': 4595.5859375, 'completions/min_length': 234.0, 'completions/max_length': 14917.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4595.5859375, 'completions/min_terminated_length': 234.0, 'completions/max_terminated_length': 14917.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.33797892928123474, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016325267031788826, 'sampling/sampling_logp_difference/max': 12.30532169342041, 'sampling/importance_sampling_ratio/min': 4.5275855882209726e-06, 'sampling/importance_sampling_ratio/mean': 1.0000278949737549, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.421765217761276e-05, 'epoch': 0.13}
+
+ 14%|█▎        | 139/1024 [5:31:04<36:35:30, 148.85s/it][AINFO 12-01 01:02:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:02:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:02:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:02:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▎        | 140/1024 [5:33:58<38:26:19, 156.54s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002165138954296708, 'learning_rate': 1e-05, 'num_tokens': 103466287.0, 'completions/mean_length': 6600.9140625, 'completions/min_length': 564.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6523.8818359375, 'completions/min_terminated_length': 564.0, 'completions/max_terminated_length': 16333.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.21040895581245422, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01811443641781807, 'sampling/sampling_logp_difference/max': 4.489964485168457, 'sampling/importance_sampling_ratio/min': 0.011221043765544891, 'sampling/importance_sampling_ratio/mean': 0.9999756217002869, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.66229777960325e-05, 'epoch': 0.13}
+
+ 14%|█▎        | 140/1024 [5:33:58<38:26:19, 156.54s/it][AINFO 12-01 01:05:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:05:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:05:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:05:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 141/1024 [5:36:25<37:38:28, 153.46s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002491142600774765, 'learning_rate': 1e-05, 'num_tokens': 104297798.0, 'completions/mean_length': 6350.6171875, 'completions/min_length': 931.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6191.357421875, 'completions/min_terminated_length': 931.0, 'completions/max_terminated_length': 15214.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0194578655064106, 'sampling/sampling_logp_difference/max': 9.046926498413086, 'sampling/importance_sampling_ratio/min': 0.00011775239545386285, 'sampling/importance_sampling_ratio/mean': 0.9999500513076782, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.093579968866834e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 141/1024 [5:36:25<37:38:28, 153.46s/it][AINFO 12-01 01:07:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:07:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:07:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:07:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 142/1024 [5:39:03<37:59:07, 155.04s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018304859986528754, 'learning_rate': 1e-05, 'num_tokens': 105084374.0, 'completions/mean_length': 6002.1875, 'completions/min_length': 593.0, 'completions/max_length': 15955.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6002.1875, 'completions/min_terminated_length': 593.0, 'completions/max_terminated_length': 15955.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.31823596358299255, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018424104899168015, 'sampling/sampling_logp_difference/max': 7.763115882873535, 'sampling/importance_sampling_ratio/min': 0.0004251298669259995, 'sampling/importance_sampling_ratio/mean': 0.9999578595161438, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.9718985539802816e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 142/1024 [5:39:03<37:59:07, 155.04s/it][AINFO 12-01 01:10:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:10:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:10:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:10:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 143/1024 [5:41:14<36:09:40, 147.76s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0013141741510480642, 'learning_rate': 1e-05, 'num_tokens': 105805043.0, 'completions/mean_length': 5474.6640625, 'completions/min_length': 4.0, 'completions/max_length': 15377.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5474.6640625, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15377.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020161191001534462, 'sampling/sampling_logp_difference/max': 4.460496425628662, 'sampling/importance_sampling_ratio/min': 0.011556625366210938, 'sampling/importance_sampling_ratio/mean': 0.9999996423721313, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.935256740485784e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 143/1024 [5:41:14<36:09:40, 147.76s/it][AINFO 12-01 01:12:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:12:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:12:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:12:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 144/1024 [5:43:26<34:54:46, 142.83s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013947135303169489, 'learning_rate': 1e-05, 'num_tokens': 106598066.0, 'completions/mean_length': 6049.4296875, 'completions/min_length': 678.0, 'completions/max_length': 14501.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6049.4296875, 'completions/min_terminated_length': 678.0, 'completions/max_terminated_length': 14501.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3345640003681183, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018179945647716522, 'sampling/sampling_logp_difference/max': 2.35213565826416, 'sampling/importance_sampling_ratio/min': 0.09516570717096329, 'sampling/importance_sampling_ratio/mean': 0.9999922513961792, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.533437165970099e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 144/1024 [5:43:26<34:54:46, 142.83s/it][AINFO 12-01 01:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:14:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:14:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 145/1024 [5:45:40<34:15:36, 140.31s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0017538930987939239, 'learning_rate': 1e-05, 'num_tokens': 107339409.0, 'completions/mean_length': 5643.9921875, 'completions/min_length': 335.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5473.51611328125, 'completions/min_terminated_length': 335.0, 'completions/max_terminated_length': 14701.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.3022220730781555, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020225487649440765, 'sampling/sampling_logp_difference/max': 1.0991297960281372, 'sampling/importance_sampling_ratio/min': 0.35633882880210876, 'sampling/importance_sampling_ratio/mean': 0.999894380569458, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.490754710990586e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 145/1024 [5:45:40<34:15:36, 140.31s/it][AINFO 12-01 01:16:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:16:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:16:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:16:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 146/1024 [5:48:12<35:03:29, 143.75s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0020841918885707855, 'learning_rate': 1e-05, 'num_tokens': 108232987.0, 'completions/mean_length': 6791.765625, 'completions/min_length': 372.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6639.50830078125, 'completions/min_terminated_length': 372.0, 'completions/max_terminated_length': 15201.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.3169426918029785, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019769608974456787, 'sampling/sampling_logp_difference/max': 7.72585391998291, 'sampling/importance_sampling_ratio/min': 0.0004412698617670685, 'sampling/importance_sampling_ratio/mean': 0.9999653100967407, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.435688975945595e-05, 'epoch': 0.13}
+
+ 14%|█▍        | 146/1024 [5:48:12<35:03:29, 143.75s/it][AINFO 12-01 01:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:19:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 147/1024 [5:50:48<35:55:52, 147.49s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.003091903170570731, 'learning_rate': 1e-05, 'num_tokens': 109058799.0, 'completions/mean_length': 6285.15625, 'completions/min_length': 107.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5959.38671875, 'completions/min_terminated_length': 107.0, 'completions/max_terminated_length': 14648.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.32036253809928894, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020912211388349533, 'sampling/sampling_logp_difference/max': 2.7648582458496094, 'sampling/importance_sampling_ratio/min': 0.06298502534627914, 'sampling/importance_sampling_ratio/mean': 1.0000264644622803, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.124521428318985e-05, 'epoch': 0.14}
+
+ 14%|█▍        | 147/1024 [5:50:48<35:55:52, 147.49s/it][AINFO 12-01 01:22:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:22:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:22:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:22:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 14%|█▍        | 148/1024 [5:53:25<36:37:13, 150.49s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.004796450491994619, 'learning_rate': 1e-05, 'num_tokens': 109829844.0, 'completions/mean_length': 5879.4765625, 'completions/min_length': 1012.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5712.73828125, 'completions/min_terminated_length': 1012.0, 'completions/max_terminated_length': 15484.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2709311842918396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01790485717356205, 'sampling/sampling_logp_difference/max': 2.164942741394043, 'sampling/importance_sampling_ratio/min': 0.1147565096616745, 'sampling/importance_sampling_ratio/mean': 0.9999945163726807, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.497631655591249e-05, 'epoch': 0.14}
+
+ 14%|█▍        | 148/1024 [5:53:25<36:37:13, 150.49s/it][AINFO 12-01 01:24:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:24:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:24:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:24:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▍        | 149/1024 [5:56:09<37:31:37, 154.40s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.002840550849214196, 'learning_rate': 1e-05, 'num_tokens': 110692949.0, 'completions/mean_length': 6589.0703125, 'completions/min_length': 782.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6273.1044921875, 'completions/min_terminated_length': 782.0, 'completions/max_terminated_length': 16254.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3897692859172821, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018374301493167877, 'sampling/sampling_logp_difference/max': 3.2123022079467773, 'sampling/importance_sampling_ratio/min': 0.040263812988996506, 'sampling/importance_sampling_ratio/mean': 1.000068187713623, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.671137034823914e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 149/1024 [5:56:09<37:31:37, 154.40s/it][AINFO 12-01 01:27:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:27:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:27:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:27:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▍        | 150/1024 [5:58:48<37:50:57, 155.90s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0025372474920004606, 'learning_rate': 1e-05, 'num_tokens': 111571332.0, 'completions/mean_length': 6712.0546875, 'completions/min_length': 25.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6400.05615234375, 'completions/min_terminated_length': 25.0, 'completions/max_terminated_length': 15916.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.33220988512039185, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019370142370462418, 'sampling/sampling_logp_difference/max': 2.218968629837036, 'sampling/importance_sampling_ratio/min': 0.10872118175029755, 'sampling/importance_sampling_ratio/mean': 0.9999641180038452, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.0054529108601855e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 150/1024 [5:58:48<37:50:57, 155.90s/it][AINFO 12-01 01:30:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:30:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:30:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:30:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▍        | 151/1024 [6:01:18<37:23:05, 154.16s/it][A
+                                                        [A{'loss': -0.0001, 'grad_norm': 0.0025457784067839384, 'learning_rate': 1e-05, 'num_tokens': 112328529.0, 'completions/mean_length': 5766.4765625, 'completions/min_length': 389.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5511.65625, 'completions/min_terminated_length': 389.0, 'completions/max_terminated_length': 16241.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.31800350546836853, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018980110064148903, 'sampling/sampling_logp_difference/max': 2.5051662921905518, 'sampling/importance_sampling_ratio/min': 0.08166201412677765, 'sampling/importance_sampling_ratio/mean': 1.0000464916229248, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9512954824895132e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 151/1024 [6:01:18<37:23:05, 154.16s/it][AINFO 12-01 01:32:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:32:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:32:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:32:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▍        | 152/1024 [6:03:40<36:25:59, 150.41s/it][A
+                                                        [A{'loss': -0.0001, 'grad_norm': 0.004166905768215656, 'learning_rate': 1e-05, 'num_tokens': 113108142.0, 'completions/mean_length': 5945.9765625, 'completions/min_length': 530.0, 'completions/max_length': 16290.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5945.9765625, 'completions/min_terminated_length': 530.0, 'completions/max_terminated_length': 16290.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.4026774764060974, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019215388223528862, 'sampling/sampling_logp_difference/max': 1.8917036056518555, 'sampling/importance_sampling_ratio/min': 0.15081465244293213, 'sampling/importance_sampling_ratio/mean': 1.0000014305114746, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.50051835893828e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 152/1024 [6:03:40<36:25:59, 150.41s/it][AINFO 12-01 01:34:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:34:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:34:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:34:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▍        | 153/1024 [6:06:16<36:47:35, 152.07s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0023706508800387383, 'learning_rate': 1e-05, 'num_tokens': 113936198.0, 'completions/mean_length': 6303.75, 'completions/min_length': 110.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6061.82421875, 'completions/min_terminated_length': 110.0, 'completions/max_terminated_length': 13820.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019496817141771317, 'sampling/sampling_logp_difference/max': 2.384566307067871, 'sampling/importance_sampling_ratio/min': 0.0921289324760437, 'sampling/importance_sampling_ratio/mean': 1.0000280141830444, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.662430481199408e-05, 'epoch': 0.14}
+
+ 15%|█▍        | 153/1024 [6:06:16<36:47:35, 152.07s/it][AINFO 12-01 01:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:37:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:37:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▌        | 154/1024 [6:08:47<36:40:34, 151.76s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002216464141383767, 'learning_rate': 1e-05, 'num_tokens': 114754590.0, 'completions/mean_length': 6236.125, 'completions/min_length': 497.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6156.22021484375, 'completions/min_terminated_length': 497.0, 'completions/max_terminated_length': 14980.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.31222954392433167, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019204700365662575, 'sampling/sampling_logp_difference/max': 4.976531505584717, 'sampling/importance_sampling_ratio/min': 0.0068979463540017605, 'sampling/importance_sampling_ratio/mean': 0.9999802112579346, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.913839352913783e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 154/1024 [6:08:47<36:40:34, 151.76s/it][AINFO 12-01 01:40:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:40:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:40:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:40:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▌        | 155/1024 [6:11:29<37:21:33, 154.77s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016367356292903423, 'learning_rate': 1e-05, 'num_tokens': 115643340.0, 'completions/mean_length': 6783.046875, 'completions/min_length': 13.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6473.33837890625, 'completions/min_terminated_length': 13.0, 'completions/max_terminated_length': 15588.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01901746541261673, 'sampling/sampling_logp_difference/max': 1.7498764991760254, 'sampling/importance_sampling_ratio/min': 0.1737954020500183, 'sampling/importance_sampling_ratio/mean': 1.0000264644622803, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.912410895485664e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 155/1024 [6:11:29<37:21:33, 154.77s/it][AINFO 12-01 01:42:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:42:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:42:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:42:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▌        | 156/1024 [6:14:01<37:07:20, 153.96s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018292386084794998, 'learning_rate': 1e-05, 'num_tokens': 116435944.0, 'completions/mean_length': 6060.59375, 'completions/min_length': 681.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5979.30712890625, 'completions/min_terminated_length': 681.0, 'completions/max_terminated_length': 16251.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3077537715435028, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01732320338487625, 'sampling/sampling_logp_difference/max': 3.4779717922210693, 'sampling/importance_sampling_ratio/min': 0.03086995892226696, 'sampling/importance_sampling_ratio/mean': 0.9999863505363464, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.078463848600222e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 156/1024 [6:14:01<37:07:20, 153.96s/it][AINFO 12-01 01:45:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:45:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:45:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:45:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▌        | 157/1024 [6:16:38<37:15:59, 154.74s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002669492270797491, 'learning_rate': 1e-05, 'num_tokens': 117202355.0, 'completions/mean_length': 5814.0859375, 'completions/min_length': 713.0, 'completions/max_length': 16221.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5814.0859375, 'completions/min_terminated_length': 713.0, 'completions/max_terminated_length': 16221.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2977413833141327, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01812589541077614, 'sampling/sampling_logp_difference/max': 2.9810123443603516, 'sampling/importance_sampling_ratio/min': 0.050741441547870636, 'sampling/importance_sampling_ratio/mean': 1.0000298023223877, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.807956879129051e-05, 'epoch': 0.14}
+
+ 15%|█▌        | 157/1024 [6:16:38<37:15:59, 154.74s/it][AINFO 12-01 01:47:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:47:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:47:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:47:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 15%|█▌        | 158/1024 [6:19:19<37:43:17, 156.81s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0024420206900686026, 'learning_rate': 1e-05, 'num_tokens': 118200372.0, 'completions/mean_length': 7648.6953125, 'completions/min_length': 878.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7579.91357421875, 'completions/min_terminated_length': 878.0, 'completions/max_terminated_length': 15766.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3135277032852173, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019768700003623962, 'sampling/sampling_logp_difference/max': 15.85932731628418, 'sampling/importance_sampling_ratio/min': 1.2953336181453778e-07, 'sampling/importance_sampling_ratio/mean': 1.000028133392334, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7749847908562515e-05, 'epoch': 0.15}
+
+ 15%|█▌        | 158/1024 [6:19:19<37:43:17, 156.81s/it][AINFO 12-01 01:50:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:50:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:50:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:50:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+[OpenTinker] 2025-12-01 01:52:26,399 - math_verify.grader - WARNING - Timeout during comparison
+[OpenTinker] 2025-12-01 01:52:31,405 - math_verify.grader - WARNING - Timeout during comparison
+
+ 16%|█▌        | 159/1024 [6:22:17<39:12:17, 163.17s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0019200673559680581, 'learning_rate': 1e-05, 'num_tokens': 119088651.0, 'completions/mean_length': 6802.9296875, 'completions/min_length': 405.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6331.72900390625, 'completions/min_terminated_length': 405.0, 'completions/max_terminated_length': 15477.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018352188169956207, 'sampling/sampling_logp_difference/max': 2.9691734313964844, 'sampling/importance_sampling_ratio/min': 0.05134573578834534, 'sampling/importance_sampling_ratio/mean': 0.9999526143074036, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.996142092750233e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 159/1024 [6:22:17<39:12:17, 163.17s/it][AINFO 12-01 01:53:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:53:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:53:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:53:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▌        | 160/1024 [6:25:14<40:08:45, 167.28s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0010218037059530616, 'learning_rate': 1e-05, 'num_tokens': 119914806.0, 'completions/mean_length': 6304.7109375, 'completions/min_length': 862.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6062.80810546875, 'completions/min_terminated_length': 862.0, 'completions/max_terminated_length': 14280.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019088296219706535, 'sampling/sampling_logp_difference/max': 5.878946781158447, 'sampling/importance_sampling_ratio/min': 0.002797730267047882, 'sampling/importance_sampling_ratio/mean': 1.000005841255188, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.3465745786998014e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 160/1024 [6:25:14<40:08:45, 167.28s/it][AINFO 12-01 01:56:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:56:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:56:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:56:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▌        | 161/1024 [6:27:34<38:10:01, 159.21s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0021929373033344746, 'learning_rate': 1e-05, 'num_tokens': 120749586.0, 'completions/mean_length': 6360.84375, 'completions/min_length': 656.0, 'completions/max_length': 13903.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6360.84375, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 13903.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.3164186477661133, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020394766703248024, 'sampling/sampling_logp_difference/max': 13.30980110168457, 'sampling/importance_sampling_ratio/min': 1.6581615227551083e-06, 'sampling/importance_sampling_ratio/mean': 1.0000377893447876, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.070383645033871e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 161/1024 [6:27:34<38:10:01, 159.21s/it][AINFO 12-01 01:58:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:58:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:58:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 01:58:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▌        | 162/1024 [6:29:50<36:23:18, 151.97s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0029296104330569506, 'learning_rate': 1e-05, 'num_tokens': 121520920.0, 'completions/mean_length': 5823.546875, 'completions/min_length': 400.0, 'completions/max_length': 13501.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5823.546875, 'completions/min_terminated_length': 400.0, 'completions/max_terminated_length': 13501.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.21648235619068146, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020458359271287918, 'sampling/sampling_logp_difference/max': 6.030230522155762, 'sampling/importance_sampling_ratio/min': 0.002404939616099, 'sampling/importance_sampling_ratio/mean': 1.0000139474868774, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.732826730309171e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 162/1024 [6:29:50<36:23:18, 151.97s/it][AINFO 12-01 02:01:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:01:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:01:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:01:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▌        | 163/1024 [6:32:42<37:47:20, 158.00s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0034187166020274162, 'learning_rate': 1e-05, 'num_tokens': 122456965.0, 'completions/mean_length': 7149.6640625, 'completions/min_length': 809.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7003.087890625, 'completions/min_terminated_length': 809.0, 'completions/max_terminated_length': 16199.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.31141096353530884, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019740914925932884, 'sampling/sampling_logp_difference/max': 1.4492828845977783, 'sampling/importance_sampling_ratio/min': 0.2347385734319687, 'sampling/importance_sampling_ratio/mean': 0.9999496936798096, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.220558123004594e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 163/1024 [6:32:42<37:47:20, 158.00s/it][AINFO 12-01 02:03:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:03:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:03:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:03:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▌        | 164/1024 [6:35:06<36:46:57, 153.97s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.003124989802017808, 'learning_rate': 1e-05, 'num_tokens': 123114356.0, 'completions/mean_length': 4971.4921875, 'completions/min_length': 432.0, 'completions/max_length': 15750.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4971.4921875, 'completions/min_terminated_length': 432.0, 'completions/max_terminated_length': 15750.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.19727616012096405, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01845206692814827, 'sampling/sampling_logp_difference/max': 2.7360496520996094, 'sampling/importance_sampling_ratio/min': 0.06482592970132828, 'sampling/importance_sampling_ratio/mean': 0.9999678134918213, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.925100900734833e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 164/1024 [6:35:06<36:46:57, 153.97s/it][AINFO 12-01 02:06:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:06:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:06:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:06:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▌        | 165/1024 [6:37:30<36:01:43, 150.99s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015882584266364574, 'learning_rate': 1e-05, 'num_tokens': 123957123.0, 'completions/mean_length': 6434.6171875, 'completions/min_length': 797.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6276.69091796875, 'completions/min_terminated_length': 797.0, 'completions/max_terminated_length': 15390.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.32089442014694214, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01870265230536461, 'sampling/sampling_logp_difference/max': 1.6867330074310303, 'sampling/importance_sampling_ratio/min': 0.20053833723068237, 'sampling/importance_sampling_ratio/mean': 1.000002384185791, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.506157119976706e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 165/1024 [6:37:30<36:01:43, 150.99s/it][AINFO 12-01 02:08:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:08:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:08:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:08:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▌        | 166/1024 [6:39:45<34:51:30, 146.26s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.004615283105522394, 'learning_rate': 1e-05, 'num_tokens': 124582077.0, 'completions/mean_length': 4734.953125, 'completions/min_length': 118.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4643.228515625, 'completions/min_terminated_length': 118.0, 'completions/max_terminated_length': 15888.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.23410367965698242, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02035534754395485, 'sampling/sampling_logp_difference/max': 1.4681086540222168, 'sampling/importance_sampling_ratio/min': 0.23661619424819946, 'sampling/importance_sampling_ratio/mean': 0.9999631643295288, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3604675940732704e-05, 'epoch': 0.15}
+
+ 16%|█▌        | 166/1024 [6:39:45<34:51:30, 146.26s/it][AINFO 12-01 02:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:11:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▋        | 167/1024 [6:42:18<35:14:06, 148.01s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.004369561094790697, 'learning_rate': 1e-05, 'num_tokens': 125413559.0, 'completions/mean_length': 6327.390625, 'completions/min_length': 41.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6248.20458984375, 'completions/min_terminated_length': 41.0, 'completions/max_terminated_length': 14906.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.23646268248558044, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022648120298981667, 'sampling/sampling_logp_difference/max': 6.613134384155273, 'sampling/importance_sampling_ratio/min': 0.00134261732455343, 'sampling/importance_sampling_ratio/mean': 0.9999423027038574, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.303239077467879e-05, 'epoch': 0.15}
+
+ 16%|█▋        | 167/1024 [6:42:18<35:14:06, 148.01s/it][AINFO 12-01 02:13:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:13:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:13:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:13:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 16%|█▋        | 168/1024 [6:44:54<35:49:03, 150.64s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0014957950916141272, 'learning_rate': 1e-05, 'num_tokens': 126277104.0, 'completions/mean_length': 6606.6328125, 'completions/min_length': 619.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6529.6455078125, 'completions/min_terminated_length': 619.0, 'completions/max_terminated_length': 15068.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01851297914981842, 'sampling/sampling_logp_difference/max': 1.4878109693527222, 'sampling/importance_sampling_ratio/min': 0.22586654126644135, 'sampling/importance_sampling_ratio/mean': 0.9999467134475708, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.176285028734128e-05, 'epoch': 0.15}
+
+ 16%|█▋        | 168/1024 [6:44:54<35:49:03, 150.64s/it][AINFO 12-01 02:16:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:16:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:16:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:16:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 169/1024 [6:47:35<36:29:56, 153.68s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.00261589209549129, 'learning_rate': 1e-05, 'num_tokens': 127169447.0, 'completions/mean_length': 6810.4921875, 'completions/min_length': 689.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6580.728515625, 'completions/min_terminated_length': 689.0, 'completions/max_terminated_length': 15341.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.26485776901245117, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01845637522637844, 'sampling/sampling_logp_difference/max': 2.7917985916137695, 'sampling/importance_sampling_ratio/min': 0.06131083890795708, 'sampling/importance_sampling_ratio/mean': 1.0000319480895996, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.323500343161868e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 169/1024 [6:47:35<36:29:56, 153.68s/it][AINFO 12-01 02:18:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:18:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:18:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:18:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 170/1024 [6:50:15<36:53:21, 155.51s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.012636496685445309, 'learning_rate': 1e-05, 'num_tokens': 128054118.0, 'completions/mean_length': 6770.3046875, 'completions/min_length': 99.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6129.39208984375, 'completions/min_terminated_length': 99.0, 'completions/max_terminated_length': 15893.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.41793978214263916, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.016856789588928223, 'sampling/sampling_logp_difference/max': 5.50988245010376, 'sampling/importance_sampling_ratio/min': 0.004046583082526922, 'sampling/importance_sampling_ratio/mean': 0.9999832510948181, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.379216660614475e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 170/1024 [6:50:15<36:53:21, 155.51s/it][AINFO 12-01 02:21:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:21:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:21:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:21:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 171/1024 [6:52:41<36:09:13, 152.58s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013498460175469518, 'learning_rate': 1e-05, 'num_tokens': 128871451.0, 'completions/mean_length': 6235.2265625, 'completions/min_length': 886.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6155.31494140625, 'completions/min_terminated_length': 886.0, 'completions/max_terminated_length': 14444.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02060030773282051, 'sampling/sampling_logp_difference/max': 2.3952834606170654, 'sampling/importance_sampling_ratio/min': 0.09114684164524078, 'sampling/importance_sampling_ratio/mean': 0.9999770522117615, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6124366374679084e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 171/1024 [6:52:41<36:09:13, 152.58s/it][AINFO 12-01 02:23:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:23:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:23:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:23:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 172/1024 [6:54:59<35:04:54, 148.23s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.024525245651602745, 'learning_rate': 1e-05, 'num_tokens': 129574594.0, 'completions/mean_length': 5347.5546875, 'completions/min_length': 17.0, 'completions/max_length': 16322.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5347.5546875, 'completions/min_terminated_length': 17.0, 'completions/max_terminated_length': 16322.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018004950135946274, 'sampling/sampling_logp_difference/max': 1.820425271987915, 'sampling/importance_sampling_ratio/min': 0.16195686161518097, 'sampling/importance_sampling_ratio/mean': 0.9999852180480957, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.8532597727680695e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 172/1024 [6:54:59<35:04:54, 148.23s/it][AINFO 12-01 02:26:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:26:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:26:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:26:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 173/1024 [6:57:34<35:33:57, 150.45s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0010482141515240073, 'learning_rate': 1e-05, 'num_tokens': 130517996.0, 'completions/mean_length': 7214.265625, 'completions/min_length': 303.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7068.71484375, 'completions/min_terminated_length': 303.0, 'completions/max_terminated_length': 16200.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02076195925474167, 'sampling/sampling_logp_difference/max': 2.5702314376831055, 'sampling/importance_sampling_ratio/min': 0.07651783525943756, 'sampling/importance_sampling_ratio/mean': 0.9999964833259583, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.357747425525304e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 173/1024 [6:57:34<35:33:57, 150.45s/it][AINFO 12-01 02:28:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:28:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:28:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:28:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 174/1024 [7:00:18<36:28:08, 154.46s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0022134080063551664, 'learning_rate': 1e-05, 'num_tokens': 131545948.0, 'completions/mean_length': 7850.5, 'completions/min_length': 395.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7715.0478515625, 'completions/min_terminated_length': 395.0, 'completions/max_terminated_length': 14923.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.3124619722366333, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02226732112467289, 'sampling/sampling_logp_difference/max': 7.143752574920654, 'sampling/importance_sampling_ratio/min': 0.0007897828472778201, 'sampling/importance_sampling_ratio/mean': 0.9999847412109375, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.695274103345582e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 174/1024 [7:00:18<36:28:08, 154.46s/it][AINFO 12-01 02:31:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:31:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:31:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:31:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 175/1024 [7:02:57<36:43:37, 155.73s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002040535444393754, 'learning_rate': 1e-05, 'num_tokens': 132452352.0, 'completions/mean_length': 6935.46875, 'completions/min_length': 772.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6861.07080078125, 'completions/min_terminated_length': 772.0, 'completions/max_terminated_length': 14302.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.1841355413198471, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019656775519251823, 'sampling/sampling_logp_difference/max': 1.7382960319519043, 'sampling/importance_sampling_ratio/min': 0.22274024784564972, 'sampling/importance_sampling_ratio/mean': 1.000036358833313, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0603320624322805e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 175/1024 [7:02:57<36:43:37, 155.73s/it][AINFO 12-01 02:34:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:34:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:34:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:34:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 176/1024 [7:05:06<34:47:15, 147.68s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002847716212272644, 'learning_rate': 1e-05, 'num_tokens': 133175528.0, 'completions/mean_length': 5512.0, 'completions/min_length': 299.0, 'completions/max_length': 13980.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5512.0, 'completions/min_terminated_length': 299.0, 'completions/max_terminated_length': 13980.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2858891487121582, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019579965621232986, 'sampling/sampling_logp_difference/max': 2.778414487838745, 'sampling/importance_sampling_ratio/min': 0.062136948108673096, 'sampling/importance_sampling_ratio/mean': 0.9999563694000244, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.102384545807581e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 176/1024 [7:05:06<34:47:15, 147.68s/it][AINFO 12-01 02:36:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:36:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:36:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:36:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 177/1024 [7:07:57<36:25:37, 154.83s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0024176358710974455, 'learning_rate': 1e-05, 'num_tokens': 134166497.0, 'completions/mean_length': 7586.6953125, 'completions/min_length': 149.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7375.560546875, 'completions/min_terminated_length': 149.0, 'completions/max_terminated_length': 16361.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3216509222984314, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019919399172067642, 'sampling/sampling_logp_difference/max': 4.888542175292969, 'sampling/importance_sampling_ratio/min': 0.007532395422458649, 'sampling/importance_sampling_ratio/mean': 1.0000462532043457, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.9729748045356246e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 177/1024 [7:07:57<36:25:37, 154.83s/it][AINFO 12-01 02:39:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:39:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:39:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:39:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 178/1024 [7:10:52<37:49:12, 160.94s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018155807629227638, 'learning_rate': 1e-05, 'num_tokens': 135072592.0, 'completions/mean_length': 6907.3671875, 'completions/min_length': 488.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6601.6689453125, 'completions/min_terminated_length': 488.0, 'completions/max_terminated_length': 14999.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2959064245223999, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019855886697769165, 'sampling/sampling_logp_difference/max': 2.00116229057312, 'sampling/importance_sampling_ratio/min': 0.1351780742406845, 'sampling/importance_sampling_ratio/mean': 1.0000094175338745, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5873530464414216e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 178/1024 [7:10:52<37:49:12, 160.94s/it][AINFO 12-01 02:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:42:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:42:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 17%|█▋        | 179/1024 [7:13:29<37:26:14, 159.50s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002277001505717635, 'learning_rate': 1e-05, 'num_tokens': 135910123.0, 'completions/mean_length': 6402.5234375, 'completions/min_length': 229.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6244.087890625, 'completions/min_terminated_length': 229.0, 'completions/max_terminated_length': 16185.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.30061954259872437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019020868465304375, 'sampling/sampling_logp_difference/max': 15.955404281616211, 'sampling/importance_sampling_ratio/min': 1.1766735497076297e-07, 'sampling/importance_sampling_ratio/mean': 1.0000461339950562, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2573161156033166e-05, 'epoch': 0.16}
+
+ 17%|█▋        | 179/1024 [7:13:29<37:26:14, 159.50s/it][AINFO 12-01 02:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:44:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 180/1024 [7:16:20<38:12:29, 162.97s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0023481242824345827, 'learning_rate': 1e-05, 'num_tokens': 136872309.0, 'completions/mean_length': 7370.015625, 'completions/min_length': 1122.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7299.03955078125, 'completions/min_terminated_length': 1122.0, 'completions/max_terminated_length': 15990.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.3158867359161377, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02003081701695919, 'sampling/sampling_logp_difference/max': 9.497355461120605, 'sampling/importance_sampling_ratio/min': 7.505004032282159e-05, 'sampling/importance_sampling_ratio/mean': 0.9999698400497437, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2283937748143217e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 180/1024 [7:16:20<38:12:29, 162.97s/it][AINFO 12-01 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:47:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 181/1024 [7:18:51<37:20:31, 159.47s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001217237557284534, 'learning_rate': 1e-05, 'num_tokens': 137609476.0, 'completions/mean_length': 5626.9921875, 'completions/min_length': 1207.0, 'completions/max_length': 14608.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5626.9921875, 'completions/min_terminated_length': 1207.0, 'completions/max_terminated_length': 14608.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2874867618083954, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016615722328424454, 'sampling/sampling_logp_difference/max': 1.7648506164550781, 'sampling/importance_sampling_ratio/min': 0.1712123602628708, 'sampling/importance_sampling_ratio/mean': 1.000102162361145, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.939877157994488e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 181/1024 [7:18:51<37:20:31, 159.47s/it][AINFO 12-01 02:50:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:50:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:50:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:50:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 182/1024 [7:21:50<38:41:25, 165.42s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0029076675418764353, 'learning_rate': 1e-05, 'num_tokens': 138620051.0, 'completions/mean_length': 7724.3671875, 'completions/min_length': 626.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6990.5, 'completions/min_terminated_length': 626.0, 'completions/max_terminated_length': 15511.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.3543020486831665, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01926334947347641, 'sampling/sampling_logp_difference/max': 5.874875068664551, 'sampling/importance_sampling_ratio/min': 0.0028091452550143003, 'sampling/importance_sampling_ratio/mean': 0.9999242424964905, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 9.256054954676074e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 182/1024 [7:21:50<38:41:25, 165.42s/it][AINFO 12-01 02:53:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:53:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:53:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:53:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 183/1024 [7:23:53<35:38:57, 152.60s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017144179437309504, 'learning_rate': 1e-05, 'num_tokens': 139365324.0, 'completions/mean_length': 5657.3828125, 'completions/min_length': 492.0, 'completions/max_length': 14155.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5657.3828125, 'completions/min_terminated_length': 492.0, 'completions/max_terminated_length': 14155.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.345874547958374, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018414299935102463, 'sampling/sampling_logp_difference/max': 15.59090805053711, 'sampling/importance_sampling_ratio/min': 1.6941609715104278e-07, 'sampling/importance_sampling_ratio/mean': 0.9999309778213501, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.774583109996456e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 183/1024 [7:23:53<35:38:57, 152.60s/it][AINFO 12-01 02:55:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:55:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:55:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:55:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 184/1024 [7:26:30<35:56:56, 154.07s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0021617524325847626, 'learning_rate': 1e-05, 'num_tokens': 140238978.0, 'completions/mean_length': 6675.609375, 'completions/min_length': 658.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6442.6083984375, 'completions/min_terminated_length': 658.0, 'completions/max_terminated_length': 14361.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.41558074951171875, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.018270963802933693, 'sampling/sampling_logp_difference/max': 4.54893684387207, 'sampling/importance_sampling_ratio/min': 0.01057844515889883, 'sampling/importance_sampling_ratio/mean': 1.0000407695770264, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.67587169118633e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 184/1024 [7:26:30<35:56:56, 154.07s/it][AINFO 12-01 02:57:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:57:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:57:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 02:57:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 185/1024 [7:28:49<34:48:43, 149.37s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.00208807410672307, 'learning_rate': 1e-05, 'num_tokens': 141070303.0, 'completions/mean_length': 6314.2265625, 'completions/min_length': 1018.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6234.93701171875, 'completions/min_terminated_length': 1018.0, 'completions/max_terminated_length': 13339.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.35377800464630127, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01983012817800045, 'sampling/sampling_logp_difference/max': 2.5973153114318848, 'sampling/importance_sampling_ratio/min': 0.07447324693202972, 'sampling/importance_sampling_ratio/mean': 0.9999605417251587, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.192043028680928e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 185/1024 [7:28:49<34:48:43, 149.37s/it][AINFO 12-01 03:00:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:00:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:00:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:00:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 186/1024 [7:31:14<34:26:55, 147.99s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0019218949601054192, 'learning_rate': 1e-05, 'num_tokens': 141943130.0, 'completions/mean_length': 6662.9609375, 'completions/min_length': 1514.0, 'completions/max_length': 14336.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6662.9609375, 'completions/min_terminated_length': 1514.0, 'completions/max_terminated_length': 14336.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2937847673892975, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020462045446038246, 'sampling/sampling_logp_difference/max': 1.8307583332061768, 'sampling/importance_sampling_ratio/min': 0.16029196977615356, 'sampling/importance_sampling_ratio/mean': 0.9999896883964539, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.218937121753697e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 186/1024 [7:31:14<34:26:55, 147.99s/it][AINFO 12-01 03:02:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:02:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:02:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:02:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 187/1024 [7:33:45<34:37:38, 148.93s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001215338590554893, 'learning_rate': 1e-05, 'num_tokens': 142845552.0, 'completions/mean_length': 6903.921875, 'completions/min_length': 916.0, 'completions/max_length': 15746.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6903.921875, 'completions/min_terminated_length': 916.0, 'completions/max_terminated_length': 15746.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.17806214094161987, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02025502547621727, 'sampling/sampling_logp_difference/max': 3.6229028701782227, 'sampling/importance_sampling_ratio/min': 0.026705041527748108, 'sampling/importance_sampling_ratio/mean': 0.999931275844574, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6634882462749374e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 187/1024 [7:33:45<34:37:38, 148.93s/it][AINFO 12-01 03:05:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:05:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:05:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:05:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 188/1024 [7:36:40<36:25:00, 156.82s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0022867487277835608, 'learning_rate': 1e-05, 'num_tokens': 143691604.0, 'completions/mean_length': 6415.71875, 'completions/min_length': 848.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6094.1611328125, 'completions/min_terminated_length': 848.0, 'completions/max_terminated_length': 16214.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.17806704342365265, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020513443276286125, 'sampling/sampling_logp_difference/max': 1.6848806142807007, 'sampling/importance_sampling_ratio/min': 0.18546657264232635, 'sampling/importance_sampling_ratio/mean': 1.0000203847885132, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.9824540572699334e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 188/1024 [7:36:40<36:25:00, 156.82s/it][AINFO 12-01 03:07:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:07:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:07:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:07:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 18%|█▊        | 189/1024 [7:39:19<36:30:12, 157.38s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0024151524994522333, 'learning_rate': 1e-05, 'num_tokens': 144419217.0, 'completions/mean_length': 5533.3515625, 'completions/min_length': 748.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5272.93603515625, 'completions/min_terminated_length': 748.0, 'completions/max_terminated_length': 16378.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2756394147872925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01675681583583355, 'sampling/sampling_logp_difference/max': 2.888376235961914, 'sampling/importance_sampling_ratio/min': 0.05566653236746788, 'sampling/importance_sampling_ratio/mean': 0.9999334216117859, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.591878476072452e-05, 'epoch': 0.17}
+
+ 18%|█▊        | 189/1024 [7:39:19<36:30:12, 157.38s/it][AINFO 12-01 03:10:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:10:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:10:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:10:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▊        | 190/1024 [7:42:23<38:18:39, 165.37s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002040195045992732, 'learning_rate': 1e-05, 'num_tokens': 145449740.0, 'completions/mean_length': 7894.8359375, 'completions/min_length': 472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7691.09619140625, 'completions/min_terminated_length': 472.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02056686207652092, 'sampling/sampling_logp_difference/max': 2.882246971130371, 'sampling/importance_sampling_ratio/min': 0.05600877106189728, 'sampling/importance_sampling_ratio/mean': 1.0000523328781128, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.2846991163969506e-05, 'epoch': 0.17}
+
+ 19%|█▊        | 190/1024 [7:42:23<38:18:39, 165.37s/it][AINFO 12-01 03:13:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:13:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:13:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:13:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▊        | 191/1024 [7:45:11<38:29:08, 166.32s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0023817906621843576, 'learning_rate': 1e-05, 'num_tokens': 146343021.0, 'completions/mean_length': 6829.2578125, 'completions/min_length': 1150.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6106.63037109375, 'completions/min_terminated_length': 1150.0, 'completions/max_terminated_length': 16089.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.31141096353530884, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01734153926372528, 'sampling/sampling_logp_difference/max': 2.2829995155334473, 'sampling/importance_sampling_ratio/min': 0.1019778624176979, 'sampling/importance_sampling_ratio/mean': 0.9999542832374573, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.51802479296748e-05, 'epoch': 0.18}
+
+ 19%|█▊        | 191/1024 [7:45:11<38:29:08, 166.32s/it][AINFO 12-01 03:16:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:16:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:16:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:16:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 192/1024 [7:47:33<36:44:35, 158.99s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001516043790616095, 'learning_rate': 1e-05, 'num_tokens': 147281547.0, 'completions/mean_length': 7197.671875, 'completions/min_length': 631.0, 'completions/max_length': 14784.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7197.671875, 'completions/min_terminated_length': 631.0, 'completions/max_terminated_length': 14784.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.26932865381240845, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01847129687666893, 'sampling/sampling_logp_difference/max': 2.729764223098755, 'sampling/importance_sampling_ratio/min': 0.06523466855287552, 'sampling/importance_sampling_ratio/mean': 1.0000518560409546, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.965423886460485e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 192/1024 [7:47:33<36:44:35, 158.99s/it][AINFO 12-01 03:18:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:18:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:18:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:18:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 193/1024 [7:50:19<37:12:13, 161.17s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015142245683819056, 'learning_rate': 1e-05, 'num_tokens': 148266568.0, 'completions/mean_length': 7556.1015625, 'completions/min_length': 261.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7197.24365234375, 'completions/min_terminated_length': 261.0, 'completions/max_terminated_length': 14758.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.2451099157333374, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0204915851354599, 'sampling/sampling_logp_difference/max': 2.1921844482421875, 'sampling/importance_sampling_ratio/min': 0.111672542989254, 'sampling/importance_sampling_ratio/mean': 0.9999825358390808, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.9722614019410685e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 193/1024 [7:50:19<37:12:13, 161.17s/it][AINFO 12-01 03:21:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:21:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:21:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:21:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 194/1024 [7:52:51<36:31:44, 158.44s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0006635223980993032, 'learning_rate': 1e-05, 'num_tokens': 149133377.0, 'completions/mean_length': 6633.4453125, 'completions/min_length': 729.0, 'completions/max_length': 16243.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6633.4453125, 'completions/min_terminated_length': 729.0, 'completions/max_terminated_length': 16243.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.12756995856761932, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02072693221271038, 'sampling/sampling_logp_difference/max': 1.687322735786438, 'sampling/importance_sampling_ratio/min': 0.18501418828964233, 'sampling/importance_sampling_ratio/mean': 0.9999097585678101, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.296354057307326e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 194/1024 [7:52:51<36:31:44, 158.44s/it][AINFO 12-01 03:24:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:24:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:24:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:24:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 195/1024 [7:55:59<38:28:36, 167.09s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0022597976494580507, 'learning_rate': 1e-05, 'num_tokens': 150106603.0, 'completions/mean_length': 7474.890625, 'completions/min_length': 676.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7404.740234375, 'completions/min_terminated_length': 676.0, 'completions/max_terminated_length': 15998.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.31823596358299255, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02036304771900177, 'sampling/sampling_logp_difference/max': 2.7820920944213867, 'sampling/importance_sampling_ratio/min': 0.06190885230898857, 'sampling/importance_sampling_ratio/mean': 0.9999575018882751, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.33726078831387e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 195/1024 [7:55:59<38:28:36, 167.09s/it][AINFO 12-01 03:27:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:27:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:27:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:27:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 196/1024 [7:58:35<37:41:49, 163.90s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0020665964111685753, 'learning_rate': 1e-05, 'num_tokens': 151001850.0, 'completions/mean_length': 6808.3046875, 'completions/min_length': 821.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6419.04833984375, 'completions/min_terminated_length': 821.0, 'completions/max_terminated_length': 14975.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01825709640979767, 'sampling/sampling_logp_difference/max': 5.476752758026123, 'sampling/importance_sampling_ratio/min': 0.004182890523225069, 'sampling/importance_sampling_ratio/mean': 0.9999337196350098, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.763075432696496e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 196/1024 [7:58:35<37:41:49, 163.90s/it][AINFO 12-01 03:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:29:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 197/1024 [8:01:24<38:00:15, 165.44s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0027867560274899006, 'learning_rate': 1e-05, 'num_tokens': 151918265.0, 'completions/mean_length': 7005.9296875, 'completions/min_length': 1238.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6857.07177734375, 'completions/min_terminated_length': 1238.0, 'completions/max_terminated_length': 16310.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018458642065525055, 'sampling/sampling_logp_difference/max': 3.461454391479492, 'sampling/importance_sampling_ratio/min': 0.03138408437371254, 'sampling/importance_sampling_ratio/mean': 1.0000183582305908, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.4024538965459215e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 197/1024 [8:01:24<38:00:15, 165.44s/it][AINFO 12-01 03:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:32:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+[OpenTinker] 2025-12-01 03:34:17,740 - math_verify.grader - WARNING - Timeout during comparison
+
+ 19%|█▉        | 198/1024 [8:04:00<37:17:09, 162.51s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016060172347351909, 'learning_rate': 1e-05, 'num_tokens': 152816200.0, 'completions/mean_length': 6857.8671875, 'completions/min_length': 407.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6629.240234375, 'completions/min_terminated_length': 407.0, 'completions/max_terminated_length': 15059.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.345874547958374, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018001491203904152, 'sampling/sampling_logp_difference/max': 2.5224146842956543, 'sampling/importance_sampling_ratio/min': 0.08026555925607681, 'sampling/importance_sampling_ratio/mean': 1.000020980834961, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.51457432670577e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 198/1024 [8:04:00<37:17:09, 162.51s/it][AINFO 12-01 03:35:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:35:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:35:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:35:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 19%|█▉        | 199/1024 [8:06:33<36:34:54, 159.63s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001727749709971249, 'learning_rate': 1e-05, 'num_tokens': 153688799.0, 'completions/mean_length': 6670.2421875, 'completions/min_length': 320.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6516.05615234375, 'completions/min_terminated_length': 320.0, 'completions/max_terminated_length': 16216.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2590838074684143, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01887558586895466, 'sampling/sampling_logp_difference/max': 3.6163506507873535, 'sampling/importance_sampling_ratio/min': 0.026880595833063126, 'sampling/importance_sampling_ratio/mean': 1.0000410079956055, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.869990564453474e-05, 'epoch': 0.18}
+
+ 19%|█▉        | 199/1024 [8:06:33<36:34:54, 159.63s/it][AINFO 12-01 03:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:37:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 200/1024 [8:09:33<37:58:14, 165.89s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015861130086705089, 'learning_rate': 1e-05, 'num_tokens': 154637684.0, 'completions/mean_length': 7258.1015625, 'completions/min_length': 887.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6730.15673828125, 'completions/min_terminated_length': 887.0, 'completions/max_terminated_length': 15575.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.32879000902175903, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0175564456731081, 'sampling/sampling_logp_difference/max': 2.2665085792541504, 'sampling/importance_sampling_ratio/min': 0.10367351770401001, 'sampling/importance_sampling_ratio/mean': 1.0000615119934082, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.159937124219141e-05, 'epoch': 0.18}
+
+ 20%|█▉        | 200/1024 [8:09:33<37:58:14, 165.89s/it][AINFO 12-01 03:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:40:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 201/1024 [8:12:12<37:26:30, 163.78s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002060150494799018, 'learning_rate': 1e-05, 'num_tokens': 155529936.0, 'completions/mean_length': 6820.84375, 'completions/min_length': 355.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6745.54345703125, 'completions/min_terminated_length': 355.0, 'completions/max_terminated_length': 15804.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.31140607595443726, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020233113318681717, 'sampling/sampling_logp_difference/max': 9.126523971557617, 'sampling/importance_sampling_ratio/min': 0.00010874291911022738, 'sampling/importance_sampling_ratio/mean': 1.0000131130218506, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.8167281622445444e-05, 'epoch': 0.18}
+
+ 20%|█▉        | 201/1024 [8:12:12<37:26:30, 163.78s/it][AINFO 12-01 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:43:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 20%|█▉        | 202/1024 [8:14:50<36:58:51, 161.96s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0030148716177791357, 'learning_rate': 1e-05, 'num_tokens': 156387579.0, 'completions/mean_length': 6556.5234375, 'completions/min_length': 524.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 5723.6865234375, 'completions/min_terminated_length': 524.0, 'completions/max_terminated_length': 16280.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.23486506938934326, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018939711153507233, 'sampling/sampling_logp_difference/max': 4.909085273742676, 'sampling/importance_sampling_ratio/min': 0.0073792352341115475, 'sampling/importance_sampling_ratio/mean': 1.0000196695327759, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.362708955341077e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 202/1024 [8:14:50<36:58:51, 161.96s/it][AINFO 12-01 03:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:46:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:46:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 20%|█▉        | 203/1024 [8:17:45<37:49:37, 165.87s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.003277840092778206, 'learning_rate': 1e-05, 'num_tokens': 157437126.0, 'completions/mean_length': 8023.3359375, 'completions/min_length': 611.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7890.62744140625, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 15917.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2380426526069641, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019101303070783615, 'sampling/sampling_logp_difference/max': 7.590854644775391, 'sampling/importance_sampling_ratio/min': 0.0005050491890870035, 'sampling/importance_sampling_ratio/mean': 0.9999302625656128, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1920544990480266e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 203/1024 [8:17:45<37:49:37, 165.87s/it][AINFO 12-01 03:49:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:49:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:49:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:49:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 20%|█▉        | 204/1024 [8:20:15<36:41:40, 161.10s/it][A
+                                                        [A{'loss': 0.0006, 'grad_norm': 0.002455161651596427, 'learning_rate': 1e-05, 'num_tokens': 158183600.0, 'completions/mean_length': 5659.265625, 'completions/min_length': 736.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5574.81884765625, 'completions/min_terminated_length': 736.0, 'completions/max_terminated_length': 13552.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019287480041384697, 'sampling/sampling_logp_difference/max': 6.631682872772217, 'sampling/importance_sampling_ratio/min': 0.001317943329922855, 'sampling/importance_sampling_ratio/mean': 1.0001133680343628, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.13224324802286e-05, 'epoch': 0.19}
+
+ 20%|█▉        | 204/1024 [8:20:15<36:41:40, 161.10s/it][AINFO 12-01 03:51:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:51:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:51:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:51:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 20%|██        | 205/1024 [8:22:43<35:47:18, 157.31s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0021509125363081694, 'learning_rate': 1e-05, 'num_tokens': 159058223.0, 'completions/mean_length': 6690.4296875, 'completions/min_length': 1472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6614.1025390625, 'completions/min_terminated_length': 1472.0, 'completions/max_terminated_length': 14734.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2580180764198303, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019935518503189087, 'sampling/sampling_logp_difference/max': 2.4222536087036133, 'sampling/importance_sampling_ratio/min': 0.08872144669294357, 'sampling/importance_sampling_ratio/mean': 0.9999189376831055, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3251817689670133e-05, 'epoch': 0.19}
+
+ 20%|██        | 205/1024 [8:22:43<35:47:18, 157.31s/it][AINFO 12-01 03:54:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:54:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:54:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:54:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 20%|██        | 206/1024 [8:25:31<36:28:39, 160.54s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0010466414969414473, 'learning_rate': 1e-05, 'num_tokens': 160006394.0, 'completions/mean_length': 7255.2734375, 'completions/min_length': 1141.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7110.37353515625, 'completions/min_terminated_length': 1141.0, 'completions/max_terminated_length': 16265.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2527858018875122, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01933729462325573, 'sampling/sampling_logp_difference/max': 2.1732261180877686, 'sampling/importance_sampling_ratio/min': 0.11380986124277115, 'sampling/importance_sampling_ratio/mean': 0.9999536275863647, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.65675108848518e-05, 'epoch': 0.19}
+
+ 20%|██        | 206/1024 [8:25:31<36:28:39, 160.54s/it][AINFO 12-01 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:56:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 20%|██        | 207/1024 [8:28:07<36:07:53, 159.21s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015736498171463609, 'learning_rate': 1e-05, 'num_tokens': 160920010.0, 'completions/mean_length': 6957.0, 'completions/min_length': 675.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6807.365234375, 'completions/min_terminated_length': 675.0, 'completions/max_terminated_length': 15625.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.19728107750415802, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01956935226917267, 'sampling/sampling_logp_difference/max': 2.623157024383545, 'sampling/importance_sampling_ratio/min': 0.07257338613271713, 'sampling/importance_sampling_ratio/mean': 0.9999431371688843, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.8892021482060954e-05, 'epoch': 0.19}
+
+ 20%|██        | 207/1024 [8:28:07<36:07:53, 159.21s/it][AINFO 12-01 03:59:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:59:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:59:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 03:59:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 20%|██        | 208/1024 [8:30:09<33:32:46, 148.00s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002172674285247922, 'learning_rate': 1e-05, 'num_tokens': 161632021.0, 'completions/mean_length': 5411.6484375, 'completions/min_length': 564.0, 'completions/max_length': 14053.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5411.6484375, 'completions/min_terminated_length': 564.0, 'completions/max_terminated_length': 14053.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.34033796191215515, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01959376223385334, 'sampling/sampling_logp_difference/max': 2.1655282974243164, 'sampling/importance_sampling_ratio/min': 0.11468932777643204, 'sampling/importance_sampling_ratio/mean': 0.9999414682388306, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4218510222672194e-05, 'epoch': 0.19}
+
+ 20%|██        | 208/1024 [8:30:09<33:32:46, 148.00s/it][AINFO 12-01 04:01:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:01:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:01:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:01:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 20%|██        | 209/1024 [8:32:41<33:47:12, 149.24s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001568320207297802, 'learning_rate': 1e-05, 'num_tokens': 162305265.0, 'completions/mean_length': 5110.34375, 'completions/min_length': 205.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4931.39697265625, 'completions/min_terminated_length': 205.0, 'completions/max_terminated_length': 16238.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.33008819818496704, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017794717103242874, 'sampling/sampling_logp_difference/max': 3.7610549926757812, 'sampling/importance_sampling_ratio/min': 0.02325918897986412, 'sampling/importance_sampling_ratio/mean': 1.0000114440917969, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.370017467816069e-05, 'epoch': 0.19}
+
+ 20%|██        | 209/1024 [8:32:41<33:47:12, 149.24s/it][AINFO 12-01 04:03:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:03:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:03:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:03:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+[OpenTinker] 2025-12-01 04:05:23,049 - math_verify.grader - WARNING - Timeout during comparison
+
+ 21%|██        | 210/1024 [8:34:57<32:50:26, 145.24s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002526822965592146, 'learning_rate': 1e-05, 'num_tokens': 163030411.0, 'completions/mean_length': 5500.453125, 'completions/min_length': 636.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5239.248046875, 'completions/min_terminated_length': 636.0, 'completions/max_terminated_length': 15062.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01833813264966011, 'sampling/sampling_logp_difference/max': 1.8741247653961182, 'sampling/importance_sampling_ratio/min': 0.15348924696445465, 'sampling/importance_sampling_ratio/mean': 1.000065565109253, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.803058345714817e-05, 'epoch': 0.19}
+
+ 21%|██        | 210/1024 [8:34:57<32:50:26, 145.24s/it][AINFO 12-01 04:06:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:06:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:06:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:06:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██        | 211/1024 [8:37:44<34:13:20, 151.54s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.00221262127161026, 'learning_rate': 1e-05, 'num_tokens': 163952654.0, 'completions/mean_length': 7076.0234375, 'completions/min_length': 1516.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6775.76611328125, 'completions/min_terminated_length': 1516.0, 'completions/max_terminated_length': 14699.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.39530590176582336, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01898171380162239, 'sampling/sampling_logp_difference/max': 3.6255531311035156, 'sampling/importance_sampling_ratio/min': 0.02663435973227024, 'sampling/importance_sampling_ratio/mean': 1.0000395774841309, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.043066801135865e-05, 'epoch': 0.19}
+
+ 21%|██        | 211/1024 [8:37:44<34:13:20, 151.54s/it][AINFO 12-01 04:09:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:09:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:09:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:09:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██        | 212/1024 [8:40:44<36:08:02, 160.20s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.004261106252670288, 'learning_rate': 1e-05, 'num_tokens': 164892470.0, 'completions/mean_length': 7181.125, 'completions/min_length': 483.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6807.02392578125, 'completions/min_terminated_length': 483.0, 'completions/max_terminated_length': 16297.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020697621628642082, 'sampling/sampling_logp_difference/max': 3.5989818572998047, 'sampling/importance_sampling_ratio/min': 0.027351556345820427, 'sampling/importance_sampling_ratio/mean': 1.000020146369934, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.8043594511473202e-05, 'epoch': 0.2}
+
+ 21%|██        | 212/1024 [8:40:44<36:08:02, 160.20s/it][AINFO 12-01 04:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:12:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:12:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██        | 213/1024 [8:43:07<34:55:33, 155.04s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.00418127840384841, 'learning_rate': 1e-05, 'num_tokens': 165566367.0, 'completions/mean_length': 5109.1328125, 'completions/min_length': 228.0, 'completions/max_length': 15698.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5109.1328125, 'completions/min_terminated_length': 228.0, 'completions/max_terminated_length': 15698.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.22461533546447754, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018262382596731186, 'sampling/sampling_logp_difference/max': 1.881803274154663, 'sampling/importance_sampling_ratio/min': 0.1523151844739914, 'sampling/importance_sampling_ratio/mean': 1.0000314712524414, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.96704689364924e-05, 'epoch': 0.2}
+
+ 21%|██        | 213/1024 [8:43:07<34:55:33, 155.04s/it][AINFO 12-01 04:14:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:14:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:14:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:14:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██        | 214/1024 [8:45:38<34:37:21, 153.88s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0033189826644957066, 'learning_rate': 1e-05, 'num_tokens': 166376962.0, 'completions/mean_length': 6181.2109375, 'completions/min_length': 811.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6019.26220703125, 'completions/min_terminated_length': 811.0, 'completions/max_terminated_length': 15864.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.31298601627349854, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019347943365573883, 'sampling/sampling_logp_difference/max': 2.943303108215332, 'sampling/importance_sampling_ratio/min': 0.052691396325826645, 'sampling/importance_sampling_ratio/mean': 0.9998948574066162, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7822361832695606e-05, 'epoch': 0.2}
+
+ 21%|██        | 214/1024 [8:45:38<34:37:21, 153.88s/it][AINFO 12-01 04:16:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:16:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:16:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:16:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██        | 215/1024 [8:47:54<33:23:27, 148.59s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016709774499759078, 'learning_rate': 1e-05, 'num_tokens': 167157640.0, 'completions/mean_length': 5951.484375, 'completions/min_length': 348.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5785.88916015625, 'completions/min_terminated_length': 348.0, 'completions/max_terminated_length': 13161.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3043339252471924, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017627622932195663, 'sampling/sampling_logp_difference/max': 2.7241334915161133, 'sampling/importance_sampling_ratio/min': 0.16975867748260498, 'sampling/importance_sampling_ratio/mean': 0.9999423027038574, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.115606157029106e-05, 'epoch': 0.2}
+
+ 21%|██        | 215/1024 [8:47:54<33:23:27, 148.59s/it][AINFO 12-01 04:19:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:19:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:19:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:19:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██        | 216/1024 [8:50:53<35:21:52, 157.56s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0016149443108588457, 'learning_rate': 1e-05, 'num_tokens': 168000145.0, 'completions/mean_length': 6427.5703125, 'completions/min_length': 1034.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6188.6162109375, 'completions/min_terminated_length': 1034.0, 'completions/max_terminated_length': 16379.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02029590867459774, 'sampling/sampling_logp_difference/max': 7.21126651763916, 'sampling/importance_sampling_ratio/min': 0.0007382215699180961, 'sampling/importance_sampling_ratio/mean': 1.0000619888305664, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.7004604615067365e-05, 'epoch': 0.2}
+
+ 21%|██        | 216/1024 [8:50:53<35:21:52, 157.56s/it][AINFO 12-01 04:22:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:22:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:22:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:22:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██        | 217/1024 [8:53:25<34:56:05, 155.84s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0025989029090851545, 'learning_rate': 1e-05, 'num_tokens': 168751129.0, 'completions/mean_length': 5688.25, 'completions/min_length': 699.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5518.4765625, 'completions/min_terminated_length': 699.0, 'completions/max_terminated_length': 16172.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2093481421470642, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019034378230571747, 'sampling/sampling_logp_difference/max': 5.047535419464111, 'sampling/importance_sampling_ratio/min': 0.006425149273127317, 'sampling/importance_sampling_ratio/mean': 1.0000039339065552, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.391117033719638e-05, 'epoch': 0.2}
+
+ 21%|██        | 217/1024 [8:53:25<34:56:05, 155.84s/it][AINFO 12-01 04:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:24:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██▏       | 218/1024 [8:55:49<34:05:52, 152.30s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0014395297039300203, 'learning_rate': 1e-05, 'num_tokens': 169651958.0, 'completions/mean_length': 6900.7890625, 'completions/min_length': 1056.0, 'completions/max_length': 14658.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6900.7890625, 'completions/min_terminated_length': 1056.0, 'completions/max_terminated_length': 14658.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018791884183883667, 'sampling/sampling_logp_difference/max': 1.6278386116027832, 'sampling/importance_sampling_ratio/min': 0.1963535100221634, 'sampling/importance_sampling_ratio/mean': 0.9999902248382568, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.91737212339649e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 218/1024 [8:55:49<34:05:52, 152.30s/it][AINFO 12-01 04:27:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:27:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:27:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:27:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██▏       | 219/1024 [8:58:33<34:53:29, 156.04s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.00163712864741683, 'learning_rate': 1e-05, 'num_tokens': 170644834.0, 'completions/mean_length': 7555.78125, 'completions/min_length': 1141.0, 'completions/max_length': 16053.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7555.78125, 'completions/min_terminated_length': 1141.0, 'completions/max_terminated_length': 16053.0, 'rewards/accuracy_reward/mean': 0.125, 'rewards/accuracy_reward/std': 0.3320184051990509, 'reward': 0.125, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022381821647286415, 'sampling/sampling_logp_difference/max': 3.050140857696533, 'sampling/importance_sampling_ratio/min': 0.047352250665426254, 'sampling/importance_sampling_ratio/mean': 0.999955415725708, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.946223228012968e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 219/1024 [8:58:33<34:53:29, 156.04s/it][AINFO 12-01 04:29:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:29:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:29:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:29:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 21%|██▏       | 220/1024 [9:01:09<34:50:47, 156.03s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0027624722570180893, 'learning_rate': 1e-05, 'num_tokens': 171507591.0, 'completions/mean_length': 6611.7890625, 'completions/min_length': 552.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6214.54443359375, 'completions/min_terminated_length': 552.0, 'completions/max_terminated_length': 15256.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2927239239215851, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01914219744503498, 'sampling/sampling_logp_difference/max': 2.26084041595459, 'sampling/importance_sampling_ratio/min': 0.1042628288269043, 'sampling/importance_sampling_ratio/mean': 1.0000615119934082, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3970543856630684e-05, 'epoch': 0.2}
+
+ 21%|██▏       | 220/1024 [9:01:09<34:50:47, 156.03s/it][AINFO 12-01 04:32:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:32:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:32:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:32:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 221/1024 [9:03:41<34:31:31, 154.78s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0006589235854335129, 'learning_rate': 1e-05, 'num_tokens': 172371733.0, 'completions/mean_length': 6598.796875, 'completions/min_length': 482.0, 'completions/max_length': 16067.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6598.796875, 'completions/min_terminated_length': 482.0, 'completions/max_terminated_length': 16067.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.22726887464523315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020106088370084763, 'sampling/sampling_logp_difference/max': 2.9778122901916504, 'sampling/importance_sampling_ratio/min': 0.05090407282114029, 'sampling/importance_sampling_ratio/mean': 0.9999136328697205, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6926457028130244e-05, 'epoch': 0.2}
+
+ 22%|██▏       | 221/1024 [9:03:41<34:31:31, 154.78s/it][AINFO 12-01 04:34:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:34:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:34:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:34:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 222/1024 [9:06:32<35:33:58, 159.65s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019011077238246799, 'learning_rate': 1e-05, 'num_tokens': 173359131.0, 'completions/mean_length': 7566.671875, 'completions/min_length': 1187.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7056.578125, 'completions/min_terminated_length': 1187.0, 'completions/max_terminated_length': 16143.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2964431941509247, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018425267189741135, 'sampling/sampling_logp_difference/max': 6.182703495025635, 'sampling/importance_sampling_ratio/min': 0.0020648380741477013, 'sampling/importance_sampling_ratio/mean': 0.9999247789382935, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.776323152735131e-05, 'epoch': 0.2}
+
+ 22%|██▏       | 222/1024 [9:06:32<35:33:58, 159.65s/it][AINFO 12-01 04:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:37:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:37:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 223/1024 [9:09:21<36:09:10, 162.48s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013812799006700516, 'learning_rate': 1e-05, 'num_tokens': 174294187.0, 'completions/mean_length': 7149.25, 'completions/min_length': 1497.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6773.853515625, 'completions/min_terminated_length': 1497.0, 'completions/max_terminated_length': 16366.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018315069377422333, 'sampling/sampling_logp_difference/max': 2.5327765941619873, 'sampling/importance_sampling_ratio/min': 0.07943814992904663, 'sampling/importance_sampling_ratio/mean': 0.9999502897262573, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.61769475350593e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 223/1024 [9:09:21<36:09:10, 162.48s/it][AINFO 12-01 04:40:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:40:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:40:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:40:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 224/1024 [9:12:04<36:06:12, 162.47s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0018499833531677723, 'learning_rate': 1e-05, 'num_tokens': 175249105.0, 'completions/mean_length': 7313.984375, 'completions/min_length': 1495.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6945.2841796875, 'completions/min_terminated_length': 1495.0, 'completions/max_terminated_length': 16357.0, 'rewards/accuracy_reward/mean': 0.6171875, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.6171875, 'reward_std': 0.22567616403102875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01686716079711914, 'sampling/sampling_logp_difference/max': 3.5822319984436035, 'sampling/importance_sampling_ratio/min': 0.02781354822218418, 'sampling/importance_sampling_ratio/mean': 1.0000309944152832, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.8821720863779774e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 224/1024 [9:12:04<36:06:12, 162.47s/it][AINFO 12-01 04:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:43:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 22%|██▏       | 225/1024 [9:15:09<37:32:17, 169.13s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001465097302570939, 'learning_rate': 1e-05, 'num_tokens': 176162907.0, 'completions/mean_length': 7008.453125, 'completions/min_length': 733.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6783.4404296875, 'completions/min_terminated_length': 733.0, 'completions/max_terminated_length': 16356.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020364444702863693, 'sampling/sampling_logp_difference/max': 2.5734238624572754, 'sampling/importance_sampling_ratio/min': 0.07627394795417786, 'sampling/importance_sampling_ratio/mean': 0.9999629259109497, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.4807232004350226e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 225/1024 [9:15:09<37:32:17, 169.13s/it][AINFO 12-01 04:46:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:46:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:46:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:46:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 226/1024 [9:17:49<36:55:47, 166.60s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002936703385785222, 'learning_rate': 1e-05, 'num_tokens': 177051819.0, 'completions/mean_length': 6789.3125, 'completions/min_length': 931.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6149.6669921875, 'completions/min_terminated_length': 931.0, 'completions/max_terminated_length': 14964.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.27275341749191284, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019227582961320877, 'sampling/sampling_logp_difference/max': 1.9446885585784912, 'sampling/importance_sampling_ratio/min': 0.1430317610502243, 'sampling/importance_sampling_ratio/mean': 1.000022053718567, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.858313573346095e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 226/1024 [9:17:49<36:55:47, 166.60s/it][AINFO 12-01 04:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:49:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 227/1024 [9:20:14<35:23:50, 159.89s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0022553636226803064, 'learning_rate': 1e-05, 'num_tokens': 177894170.0, 'completions/mean_length': 6421.4921875, 'completions/min_length': 103.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6343.04736328125, 'completions/min_terminated_length': 103.0, 'completions/max_terminated_length': 15078.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3829345107078552, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019709140062332153, 'sampling/sampling_logp_difference/max': 1.8448920249938965, 'sampling/importance_sampling_ratio/min': 0.15804238617420197, 'sampling/importance_sampling_ratio/mean': 1.0000286102294922, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.5176093155751005e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 227/1024 [9:20:14<35:23:50, 159.89s/it][AINFO 12-01 04:51:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:51:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:51:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:51:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 228/1024 [9:22:40<34:28:43, 155.93s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0018756220815703273, 'learning_rate': 1e-05, 'num_tokens': 178751365.0, 'completions/mean_length': 6553.9609375, 'completions/min_length': 621.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6397.9287109375, 'completions/min_terminated_length': 621.0, 'completions/max_terminated_length': 15072.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019149571657180786, 'sampling/sampling_logp_difference/max': 10.531834602355957, 'sampling/importance_sampling_ratio/min': 2.667364606168121e-05, 'sampling/importance_sampling_ratio/mean': 0.9999616146087646, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.218978563130804e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 228/1024 [9:22:40<34:28:43, 155.93s/it][AINFO 12-01 04:53:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:53:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:53:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:53:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 229/1024 [9:24:59<33:19:11, 150.88s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.004922767635434866, 'learning_rate': 1e-05, 'num_tokens': 179453403.0, 'completions/mean_length': 5311.921875, 'completions/min_length': 618.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5224.740234375, 'completions/min_terminated_length': 618.0, 'completions/max_terminated_length': 13300.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02013929933309555, 'sampling/sampling_logp_difference/max': 5.425407886505127, 'sampling/importance_sampling_ratio/min': 0.004403269849717617, 'sampling/importance_sampling_ratio/mean': 1.000014066696167, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.151799990064319e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 229/1024 [9:24:59<33:19:11, 150.88s/it][AINFO 12-01 04:56:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:56:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:56:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:56:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 22%|██▏       | 230/1024 [9:27:19<32:32:37, 147.55s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.003075424814596772, 'learning_rate': 1e-05, 'num_tokens': 180190806.0, 'completions/mean_length': 5607.4609375, 'completions/min_length': 266.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5436.4052734375, 'completions/min_terminated_length': 266.0, 'completions/max_terminated_length': 15893.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016923140734434128, 'sampling/sampling_logp_difference/max': 1.931182861328125, 'sampling/importance_sampling_ratio/min': 0.14497661590576172, 'sampling/importance_sampling_ratio/mean': 0.9999542236328125, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.974678472535743e-05, 'epoch': 0.21}
+
+ 22%|██▏       | 230/1024 [9:27:19<32:32:37, 147.55s/it][AINFO 12-01 04:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:58:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 04:58:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 231/1024 [9:29:41<32:07:29, 145.84s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001226920518092811, 'learning_rate': 1e-05, 'num_tokens': 181015421.0, 'completions/mean_length': 6297.1171875, 'completions/min_length': 1080.0, 'completions/max_length': 14691.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6297.1171875, 'completions/min_terminated_length': 1080.0, 'completions/max_terminated_length': 14691.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018348123878240585, 'sampling/sampling_logp_difference/max': 5.759917259216309, 'sampling/importance_sampling_ratio/min': 0.0031513723079115152, 'sampling/importance_sampling_ratio/mean': 1.0000073909759521, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6122803521720925e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 231/1024 [9:29:41<32:07:29, 145.84s/it][AINFO 12-01 05:00:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:00:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:00:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:00:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 232/1024 [9:32:40<34:18:10, 155.92s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015028339112177491, 'learning_rate': 1e-05, 'num_tokens': 181906557.0, 'completions/mean_length': 6808.0, 'completions/min_length': 932.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6499.0966796875, 'completions/min_terminated_length': 932.0, 'completions/max_terminated_length': 16102.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2817177176475525, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021416563540697098, 'sampling/sampling_logp_difference/max': 6.303047180175781, 'sampling/importance_sampling_ratio/min': 0.0018307177815586329, 'sampling/importance_sampling_ratio/mean': 1.0000405311584473, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 9.03685236153251e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 232/1024 [9:32:40<34:18:10, 155.92s/it][AINFO 12-01 05:03:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:03:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:03:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:03:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 233/1024 [9:35:18<34:21:20, 156.36s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0012138820020481944, 'learning_rate': 1e-05, 'num_tokens': 182770378.0, 'completions/mean_length': 6588.3515625, 'completions/min_length': 490.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6353.25634765625, 'completions/min_terminated_length': 490.0, 'completions/max_terminated_length': 14188.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.17700131237506866, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02050863578915596, 'sampling/sampling_logp_difference/max': 2.7643141746520996, 'sampling/importance_sampling_ratio/min': 0.06301930546760559, 'sampling/importance_sampling_ratio/mean': 1.000015377998352, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.841858713509282e-05, 'epoch': 0.21}
+
+ 23%|██▎       | 233/1024 [9:35:18<34:21:20, 156.36s/it][AINFO 12-01 05:06:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:06:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:06:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:06:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 234/1024 [9:37:29<32:40:20, 148.89s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0017628915375098586, 'learning_rate': 1e-05, 'num_tokens': 183598614.0, 'completions/mean_length': 6280.40625, 'completions/min_length': 396.0, 'completions/max_length': 15060.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6280.40625, 'completions/min_terminated_length': 396.0, 'completions/max_terminated_length': 15060.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01853768154978752, 'sampling/sampling_logp_difference/max': 3.696566581726074, 'sampling/importance_sampling_ratio/min': 0.02480855956673622, 'sampling/importance_sampling_ratio/mean': 0.999995768070221, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1229169962898595e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 234/1024 [9:37:29<32:40:20, 148.89s/it][AINFO 12-01 05:08:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:08:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:08:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:08:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 235/1024 [9:40:05<33:04:08, 150.89s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0010316594270989299, 'learning_rate': 1e-05, 'num_tokens': 184492896.0, 'completions/mean_length': 6837.015625, 'completions/min_length': 898.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6761.84228515625, 'completions/min_terminated_length': 898.0, 'completions/max_terminated_length': 15454.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020229589194059372, 'sampling/sampling_logp_difference/max': 4.058311939239502, 'sampling/importance_sampling_ratio/min': 0.017278160899877548, 'sampling/importance_sampling_ratio/mean': 1.000056505203247, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.6980187840308645e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 235/1024 [9:40:05<33:04:08, 150.89s/it][AINFO 12-01 05:11:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:11:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:11:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:11:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 236/1024 [9:42:49<33:54:24, 154.90s/it][A
+                                                        [A{'loss': -0.0001, 'grad_norm': 0.001735079800710082, 'learning_rate': 1e-05, 'num_tokens': 185409060.0, 'completions/mean_length': 7013.65625, 'completions/min_length': 860.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6788.7685546875, 'completions/min_terminated_length': 860.0, 'completions/max_terminated_length': 15964.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2982654273509979, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020225204527378082, 'sampling/sampling_logp_difference/max': 2.133953094482422, 'sampling/importance_sampling_ratio/min': 0.11836844682693481, 'sampling/importance_sampling_ratio/mean': 1.0000355243682861, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.525792582579015e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 236/1024 [9:42:49<33:54:24, 154.90s/it][AINFO 12-01 05:14:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:14:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:14:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:14:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 237/1024 [9:45:35<34:37:16, 158.37s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002296927385032177, 'learning_rate': 1e-05, 'num_tokens': 186278737.0, 'completions/mean_length': 6657.1015625, 'completions/min_length': 1281.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6502.70654296875, 'completions/min_terminated_length': 1281.0, 'completions/max_terminated_length': 14725.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.33797892928123474, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01761571317911148, 'sampling/sampling_logp_difference/max': 2.06707763671875, 'sampling/importance_sampling_ratio/min': 0.12655507028102875, 'sampling/importance_sampling_ratio/mean': 1.0000171661376953, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7247865950339474e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 237/1024 [9:45:35<34:37:16, 158.37s/it][AINFO 12-01 05:16:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:16:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:16:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:16:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 238/1024 [9:48:01<33:45:50, 154.64s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0015714693581685424, 'learning_rate': 1e-05, 'num_tokens': 187186130.0, 'completions/mean_length': 6948.0078125, 'completions/min_length': 901.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6873.70849609375, 'completions/min_terminated_length': 901.0, 'completions/max_terminated_length': 15660.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.22962789237499237, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021574918180704117, 'sampling/sampling_logp_difference/max': 1.9929981231689453, 'sampling/importance_sampling_ratio/min': 0.136286199092865, 'sampling/importance_sampling_ratio/mean': 1.0000017881393433, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.572205762087833e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 238/1024 [9:48:01<33:45:50, 154.64s/it][AINFO 12-01 05:19:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:19:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:19:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:19:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 239/1024 [9:50:16<32:25:27, 148.70s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019131326116621494, 'learning_rate': 1e-05, 'num_tokens': 187937506.0, 'completions/mean_length': 5729.4375, 'completions/min_length': 931.0, 'completions/max_length': 16347.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5729.4375, 'completions/min_terminated_length': 931.0, 'completions/max_terminated_length': 16347.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01795974187552929, 'sampling/sampling_logp_difference/max': 4.7401862144470215, 'sampling/importance_sampling_ratio/min': 0.008737019263207912, 'sampling/importance_sampling_ratio/mean': 0.9999916553497314, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.8961928282787994e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 239/1024 [9:50:16<32:25:27, 148.70s/it][AINFO 12-01 05:21:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:21:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:21:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:21:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 23%|██▎       | 240/1024 [9:53:13<34:13:15, 157.14s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0024941260926425457, 'learning_rate': 1e-05, 'num_tokens': 188795581.0, 'completions/mean_length': 6553.7734375, 'completions/min_length': 539.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6070.3193359375, 'completions/min_terminated_length': 539.0, 'completions/max_terminated_length': 16308.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020237412303686142, 'sampling/sampling_logp_difference/max': 2.4643564224243164, 'sampling/importance_sampling_ratio/min': 0.08506356179714203, 'sampling/importance_sampling_ratio/mean': 0.9999595880508423, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.947895013174275e-05, 'epoch': 0.22}
+
+ 23%|██▎       | 240/1024 [9:53:13<34:13:15, 157.14s/it][AINFO 12-01 05:24:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:24:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:24:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:24:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▎       | 241/1024 [9:56:01<34:51:10, 160.24s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0012391218915581703, 'learning_rate': 1e-05, 'num_tokens': 189718783.0, 'completions/mean_length': 7032.453125, 'completions/min_length': 314.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6958.81884765625, 'completions/min_terminated_length': 314.0, 'completions/max_terminated_length': 16130.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2569572627544403, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020489905029535294, 'sampling/sampling_logp_difference/max': 6.113000869750977, 'sampling/importance_sampling_ratio/min': 0.002213897183537483, 'sampling/importance_sampling_ratio/mean': 1.0001022815704346, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.435635653659119e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 241/1024 [9:56:01<34:51:10, 160.24s/it][AINFO 12-01 05:27:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:27:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:27:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:27:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▎       | 242/1024 [9:58:37<34:34:09, 159.14s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0012024587485939264, 'learning_rate': 1e-05, 'num_tokens': 190556185.0, 'completions/mean_length': 6401.578125, 'completions/min_length': 705.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6162.00048828125, 'completions/min_terminated_length': 705.0, 'completions/max_terminated_length': 15407.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.27168765664100647, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019056081771850586, 'sampling/sampling_logp_difference/max': 2.3984804153442383, 'sampling/importance_sampling_ratio/min': 0.0908559113740921, 'sampling/importance_sampling_ratio/mean': 0.9999680519104004, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.9562011554371566e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 242/1024 [9:58:37<34:34:09, 159.14s/it][AINFO 12-01 05:29:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:29:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:29:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:29:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▎       | 243/1024 [10:01:24<35:03:20, 161.59s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010284221498295665, 'learning_rate': 1e-05, 'num_tokens': 191437679.0, 'completions/mean_length': 6732.734375, 'completions/min_length': 821.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6656.740234375, 'completions/min_terminated_length': 821.0, 'completions/max_terminated_length': 14946.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021014994010329247, 'sampling/sampling_logp_difference/max': 5.777632236480713, 'sampling/importance_sampling_ratio/min': 0.003096037544310093, 'sampling/importance_sampling_ratio/mean': 0.9999179840087891, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5083246504873387e-05, 'epoch': 0.22}
+
+ 24%|██▎       | 243/1024 [10:01:24<35:03:20, 161.59s/it][AINFO 12-01 05:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:32:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▍       | 244/1024 [10:03:53<34:10:08, 157.70s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0025586909614503384, 'learning_rate': 1e-05, 'num_tokens': 192234493.0, 'completions/mean_length': 6085.546875, 'completions/min_length': 721.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5753.33837890625, 'completions/min_terminated_length': 721.0, 'completions/max_terminated_length': 15579.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3724474310874939, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01826568692922592, 'sampling/sampling_logp_difference/max': 4.899214744567871, 'sampling/importance_sampling_ratio/min': 0.007452432531863451, 'sampling/importance_sampling_ratio/mean': 1.0000081062316895, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.873016198165715e-05, 'epoch': 0.22}
+
+ 24%|██▍       | 244/1024 [10:03:53<34:10:08, 157.70s/it][AINFO 12-01 05:35:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:35:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:35:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:35:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▍       | 245/1024 [10:06:35<34:21:57, 158.82s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003359193215146661, 'learning_rate': 1e-05, 'num_tokens': 193106318.0, 'completions/mean_length': 6686.3203125, 'completions/min_length': 429.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6373.49169921875, 'completions/min_terminated_length': 429.0, 'completions/max_terminated_length': 15745.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.31300368905067444, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01838621497154236, 'sampling/sampling_logp_difference/max': 18.791616439819336, 'sampling/importance_sampling_ratio/min': 6.900883420257742e-09, 'sampling/importance_sampling_ratio/mean': 0.9999434947967529, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.408609604273806e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 245/1024 [10:06:35<34:21:57, 158.82s/it][AINFO 12-01 05:37:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:37:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:37:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:37:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▍       | 246/1024 [10:08:51<32:51:57, 152.08s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.003694899147376418, 'learning_rate': 1e-05, 'num_tokens': 193841919.0, 'completions/mean_length': 5599.5703125, 'completions/min_length': 930.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5514.6533203125, 'completions/min_terminated_length': 930.0, 'completions/max_terminated_length': 15409.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.32719242572784424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020069165155291557, 'sampling/sampling_logp_difference/max': 2.2008254528045654, 'sampling/importance_sampling_ratio/min': 0.1107117310166359, 'sampling/importance_sampling_ratio/mean': 1.0000014305114746, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.688518396913423e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 246/1024 [10:08:51<32:51:57, 152.08s/it][AINFO 12-01 05:40:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:40:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:40:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:40:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▍       | 247/1024 [10:11:28<33:08:11, 153.53s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018113526748493314, 'learning_rate': 1e-05, 'num_tokens': 194827738.0, 'completions/mean_length': 7528.2734375, 'completions/min_length': 1015.0, 'completions/max_length': 16089.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7528.2734375, 'completions/min_terminated_length': 1015.0, 'completions/max_terminated_length': 16089.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.22908622026443481, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021598057821393013, 'sampling/sampling_logp_difference/max': 2.591005325317383, 'sampling/importance_sampling_ratio/min': 0.07494466006755829, 'sampling/importance_sampling_ratio/mean': 0.9999699592590332, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.403608429332962e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 247/1024 [10:11:28<33:08:11, 153.53s/it][AINFO 12-01 05:42:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:42:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:42:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:42:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▍       | 248/1024 [10:14:33<35:07:44, 162.97s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019382642349228263, 'learning_rate': 1e-05, 'num_tokens': 195855937.0, 'completions/mean_length': 7871.1796875, 'completions/min_length': 980.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7525.1298828125, 'completions/min_terminated_length': 980.0, 'completions/max_terminated_length': 16249.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0206658523529768, 'sampling/sampling_logp_difference/max': 3.7663896083831787, 'sampling/importance_sampling_ratio/min': 0.023135442286729813, 'sampling/importance_sampling_ratio/mean': 1.0000072717666626, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.830236523252097e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 248/1024 [10:14:33<35:07:44, 162.97s/it][AINFO 12-01 05:45:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:45:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:45:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:45:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▍       | 249/1024 [10:17:05<34:21:44, 159.62s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016805005725473166, 'learning_rate': 1e-05, 'num_tokens': 196824866.0, 'completions/mean_length': 7408.2578125, 'completions/min_length': 1035.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7265.7861328125, 'completions/min_terminated_length': 1035.0, 'completions/max_terminated_length': 14007.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021476035937666893, 'sampling/sampling_logp_difference/max': 3.1956331729888916, 'sampling/importance_sampling_ratio/min': 0.04094059392809868, 'sampling/importance_sampling_ratio/mean': 1.0001047849655151, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.66432447360421e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 249/1024 [10:17:05<34:21:44, 159.62s/it][AINFO 12-01 05:48:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:48:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:48:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:48:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 24%|██▍       | 250/1024 [10:20:12<36:05:05, 167.84s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001989413285627961, 'learning_rate': 1e-05, 'num_tokens': 197843919.0, 'completions/mean_length': 7826.1015625, 'completions/min_length': 1122.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7620.71240234375, 'completions/min_terminated_length': 1122.0, 'completions/max_terminated_length': 16156.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3963618278503418, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01953016221523285, 'sampling/sampling_logp_difference/max': 3.515831708908081, 'sampling/importance_sampling_ratio/min': 0.029723070561885834, 'sampling/importance_sampling_ratio/mean': 1.0000919103622437, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.725215462211054e-05, 'epoch': 0.23}
+
+ 24%|██▍       | 250/1024 [10:20:12<36:05:05, 167.84s/it][AINFO 12-01 05:51:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:51:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:51:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:51:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▍       | 251/1024 [10:23:05<36:24:09, 169.53s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001489010639488697, 'learning_rate': 1e-05, 'num_tokens': 198837012.0, 'completions/mean_length': 7631.9765625, 'completions/min_length': 654.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7048.5087890625, 'completions/min_terminated_length': 654.0, 'completions/max_terminated_length': 16311.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01829022541642189, 'sampling/sampling_logp_difference/max': 3.1050760746002197, 'sampling/importance_sampling_ratio/min': 0.044821109622716904, 'sampling/importance_sampling_ratio/mean': 1.0000070333480835, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.778014726980473e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 251/1024 [10:23:05<36:24:09, 169.53s/it][AINFO 12-01 05:54:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:54:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:54:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:54:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 25%|██▍       | 252/1024 [10:25:27<34:35:45, 161.33s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0025498855393379927, 'learning_rate': 1e-05, 'num_tokens': 199699184.0, 'completions/mean_length': 6576.28125, 'completions/min_length': 528.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6499.05517578125, 'completions/min_terminated_length': 528.0, 'completions/max_terminated_length': 13863.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021767770871520042, 'sampling/sampling_logp_difference/max': 3.4964394569396973, 'sampling/importance_sampling_ratio/min': 0.03030509315431118, 'sampling/importance_sampling_ratio/mean': 0.9999722242355347, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9525100330829446e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 252/1024 [10:25:27<34:35:45, 161.33s/it][AINFO 12-01 05:56:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:56:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:56:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:56:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▍       | 253/1024 [10:28:01<34:05:15, 159.16s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001495356555096805, 'learning_rate': 1e-05, 'num_tokens': 200563749.0, 'completions/mean_length': 6608.8515625, 'completions/min_length': 872.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6531.8818359375, 'completions/min_terminated_length': 872.0, 'completions/max_terminated_length': 15803.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.34982627630233765, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019046209752559662, 'sampling/sampling_logp_difference/max': 3.274538040161133, 'sampling/importance_sampling_ratio/min': 0.03783434256911278, 'sampling/importance_sampling_ratio/mean': 0.9999645948410034, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.036466277237196e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 253/1024 [10:28:01<34:05:15, 159.16s/it][AINFO 12-01 05:59:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:59:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:59:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 05:59:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▍       | 254/1024 [10:30:39<33:56:16, 158.67s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012633223086595535, 'learning_rate': 1e-05, 'num_tokens': 201468274.0, 'completions/mean_length': 6907.6015625, 'completions/min_length': 661.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6832.984375, 'completions/min_terminated_length': 661.0, 'completions/max_terminated_length': 14627.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020826399326324463, 'sampling/sampling_logp_difference/max': 3.0150129795074463, 'sampling/importance_sampling_ratio/min': 0.049045197665691376, 'sampling/importance_sampling_ratio/mean': 0.9999303221702576, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.709591123879363e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 254/1024 [10:30:39<33:56:16, 158.67s/it][AINFO 12-01 06:01:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:01:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:01:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:01:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▍       | 255/1024 [10:33:24<34:16:54, 160.49s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018011360662057996, 'learning_rate': 1e-05, 'num_tokens': 202352751.0, 'completions/mean_length': 6749.4765625, 'completions/min_length': 614.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6596.5478515625, 'completions/min_terminated_length': 614.0, 'completions/max_terminated_length': 14939.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.29826053977012634, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018739810213446617, 'sampling/sampling_logp_difference/max': 2.5921852588653564, 'sampling/importance_sampling_ratio/min': 0.07485628128051758, 'sampling/importance_sampling_ratio/mean': 1.0000749826431274, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2891245104547124e-05, 'epoch': 0.23}
+
+ 25%|██▍       | 255/1024 [10:33:24<34:16:54, 160.49s/it][AINFO 12-01 06:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:04:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:04:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 256/1024 [10:36:14<34:53:16, 163.54s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002084659179672599, 'learning_rate': 1e-05, 'num_tokens': 203278954.0, 'completions/mean_length': 7077.5859375, 'completions/min_length': 566.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6699.2763671875, 'completions/min_terminated_length': 566.0, 'completions/max_terminated_length': 16166.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.24883407354354858, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018939774483442307, 'sampling/sampling_logp_difference/max': 3.8118247985839844, 'sampling/importance_sampling_ratio/min': 0.022107800468802452, 'sampling/importance_sampling_ratio/mean': 1.0000278949737549, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.712265897273028e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 256/1024 [10:36:14<34:53:16, 163.54s/it][AINFO 12-01 06:07:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:07:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:07:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:07:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+[OpenTinker] 2025-12-01 06:09:13,942 - math_verify.grader - WARNING - Timeout during comparison
+
+ 25%|██▌       | 257/1024 [10:38:50<34:22:10, 161.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017338074976578355, 'learning_rate': 1e-05, 'num_tokens': 204154456.0, 'completions/mean_length': 6682.671875, 'completions/min_length': 674.0, 'completions/max_length': 15958.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6682.671875, 'completions/min_terminated_length': 674.0, 'completions/max_terminated_length': 15958.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3713865876197815, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020058799535036087, 'sampling/sampling_logp_difference/max': 4.164816856384277, 'sampling/importance_sampling_ratio/min': 0.015532558783888817, 'sampling/importance_sampling_ratio/mean': 0.9999614953994751, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.474514248111518e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 257/1024 [10:38:50<34:22:10, 161.32s/it][AINFO 12-01 06:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:10:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 258/1024 [10:41:45<35:09:55, 165.27s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.00369207002222538, 'learning_rate': 1e-05, 'num_tokens': 205056570.0, 'completions/mean_length': 6869.765625, 'completions/min_length': 309.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6641.42431640625, 'completions/min_terminated_length': 309.0, 'completions/max_terminated_length': 15816.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018504276871681213, 'sampling/sampling_logp_difference/max': 3.0091819763183594, 'sampling/importance_sampling_ratio/min': 0.049332018941640854, 'sampling/importance_sampling_ratio/mean': 0.9999786019325256, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.921131196373608e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 258/1024 [10:41:45<35:09:55, 165.27s/it][AINFO 12-01 06:13:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:13:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:13:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:13:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 259/1024 [10:44:36<35:30:24, 167.09s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017463711556047201, 'learning_rate': 1e-05, 'num_tokens': 206099943.0, 'completions/mean_length': 7985.2890625, 'completions/min_length': 286.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7714.67724609375, 'completions/min_terminated_length': 286.0, 'completions/max_terminated_length': 16165.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021547015756368637, 'sampling/sampling_logp_difference/max': 2.623727798461914, 'sampling/importance_sampling_ratio/min': 0.07253197580575943, 'sampling/importance_sampling_ratio/mean': 0.9999598264694214, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.0019783884636126e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 259/1024 [10:44:36<35:30:24, 167.09s/it][AINFO 12-01 06:15:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:15:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:15:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:15:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 260/1024 [10:47:44<36:47:13, 173.34s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001173493335954845, 'learning_rate': 1e-05, 'num_tokens': 206994067.0, 'completions/mean_length': 6825.46875, 'completions/min_length': 711.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6750.20458984375, 'completions/min_terminated_length': 711.0, 'completions/max_terminated_length': 16298.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.20858672261238098, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020518729463219643, 'sampling/sampling_logp_difference/max': 6.795599937438965, 'sampling/importance_sampling_ratio/min': 0.0011186866322532296, 'sampling/importance_sampling_ratio/mean': 0.9999593496322632, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1258084902674454e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 260/1024 [10:47:44<36:47:13, 173.34s/it][AINFO 12-01 06:19:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:19:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:19:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:19:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 25%|██▌       | 261/1024 [10:50:14<35:13:31, 166.20s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0027126881759613752, 'learning_rate': 1e-05, 'num_tokens': 207853158.0, 'completions/mean_length': 6577.8359375, 'completions/min_length': 797.0, 'completions/max_length': 16053.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6577.8359375, 'completions/min_terminated_length': 797.0, 'completions/max_terminated_length': 16053.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.35718512535095215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01916714943945408, 'sampling/sampling_logp_difference/max': 6.088956832885742, 'sampling/importance_sampling_ratio/min': 0.002267773263156414, 'sampling/importance_sampling_ratio/mean': 1.000032901763916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.904602762209834e-05, 'epoch': 0.24}
+
+ 25%|██▌       | 261/1024 [10:50:14<35:13:31, 166.20s/it][AINFO 12-01 06:21:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:21:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:21:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:21:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▌       | 262/1024 [10:52:45<34:14:14, 161.75s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003408671822398901, 'learning_rate': 1e-05, 'num_tokens': 208711173.0, 'completions/mean_length': 6546.9296875, 'completions/min_length': 627.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6310.84033203125, 'completions/min_terminated_length': 627.0, 'completions/max_terminated_length': 15249.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.34983116388320923, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01826491206884384, 'sampling/sampling_logp_difference/max': 5.198025703430176, 'sampling/importance_sampling_ratio/min': 0.00552746606990695, 'sampling/importance_sampling_ratio/mean': 0.9999771118164062, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.331025454324845e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 262/1024 [10:52:45<34:14:14, 161.75s/it][AINFO 12-01 06:24:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:24:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:24:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:24:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▌       | 263/1024 [10:55:23<33:58:30, 160.72s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015714941546320915, 'learning_rate': 1e-05, 'num_tokens': 209636202.0, 'completions/mean_length': 7057.6015625, 'completions/min_length': 1261.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6833.7685546875, 'completions/min_terminated_length': 1261.0, 'completions/max_terminated_length': 15359.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020694922655820847, 'sampling/sampling_logp_difference/max': 3.290691375732422, 'sampling/importance_sampling_ratio/min': 0.03722810000181198, 'sampling/importance_sampling_ratio/mean': 1.0000311136245728, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.078962456333102e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 263/1024 [10:55:23<33:58:30, 160.72s/it][AINFO 12-01 06:26:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:26:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:26:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:26:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▌       | 264/1024 [10:57:47<32:48:50, 155.44s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007676694658584893, 'learning_rate': 1e-05, 'num_tokens': 210532232.0, 'completions/mean_length': 6826.296875, 'completions/min_length': 805.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6751.03955078125, 'completions/min_terminated_length': 805.0, 'completions/max_terminated_length': 14898.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01967993564903736, 'sampling/sampling_logp_difference/max': 4.1035003662109375, 'sampling/importance_sampling_ratio/min': 0.016514765098690987, 'sampling/importance_sampling_ratio/mean': 1.0000896453857422, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.150668510443211e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 264/1024 [10:57:47<32:48:50, 155.44s/it][AINFO 12-01 06:29:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:29:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:29:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:29:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▌       | 265/1024 [11:00:35<33:36:45, 159.43s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021341743413358927, 'learning_rate': 1e-05, 'num_tokens': 211474763.0, 'completions/mean_length': 7196.2109375, 'completions/min_length': 467.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6975.7041015625, 'completions/min_terminated_length': 467.0, 'completions/max_terminated_length': 15725.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2788218855857849, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019507713615894318, 'sampling/sampling_logp_difference/max': 1.9037050008773804, 'sampling/importance_sampling_ratio/min': 0.14901548624038696, 'sampling/importance_sampling_ratio/mean': 1.000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.482975120685296e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 265/1024 [11:00:35<33:36:45, 159.43s/it][AINFO 12-01 06:31:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:31:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:31:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:31:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▌       | 266/1024 [11:03:26<34:18:33, 162.95s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001817435258999467, 'learning_rate': 1e-05, 'num_tokens': 212518136.0, 'completions/mean_length': 7998.9140625, 'completions/min_length': 498.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7439.90869140625, 'completions/min_terminated_length': 498.0, 'completions/max_terminated_length': 14637.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.36113685369491577, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02061084844172001, 'sampling/sampling_logp_difference/max': 3.26106595993042, 'sampling/importance_sampling_ratio/min': 0.038347501307725906, 'sampling/importance_sampling_ratio/mean': 1.0000529289245605, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6967152431752766e-05, 'epoch': 0.24}
+
+ 26%|██▌       | 266/1024 [11:03:26<34:18:33, 162.95s/it][AINFO 12-01 06:34:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:34:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:34:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:34:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 26%|██▌       | 267/1024 [11:06:03<33:53:38, 161.19s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.002446004655212164, 'learning_rate': 1e-05, 'num_tokens': 213413360.0, 'completions/mean_length': 6820.5625, 'completions/min_length': 698.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6745.259765625, 'completions/min_terminated_length': 698.0, 'completions/max_terminated_length': 16051.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01983708143234253, 'sampling/sampling_logp_difference/max': 1.916466474533081, 'sampling/importance_sampling_ratio/min': 0.14712592959403992, 'sampling/importance_sampling_ratio/mean': 1.0000149011611938, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.440379689185647e-05, 'epoch': 0.25}
+
+ 26%|██▌       | 267/1024 [11:06:03<33:53:38, 161.19s/it][AINFO 12-01 06:37:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:37:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:37:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:37:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▌       | 268/1024 [11:08:59<34:43:32, 165.36s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002108057728037238, 'learning_rate': 1e-05, 'num_tokens': 214336032.0, 'completions/mean_length': 7064.625, 'completions/min_length': 734.0, 'completions/max_length': 16190.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7064.625, 'completions/min_terminated_length': 734.0, 'completions/max_terminated_length': 16190.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.33220988512039185, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020157091319561005, 'sampling/sampling_logp_difference/max': 3.0970616340637207, 'sampling/importance_sampling_ratio/min': 0.045181769877672195, 'sampling/importance_sampling_ratio/mean': 0.9999593496322632, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.522829764639027e-05, 'epoch': 0.25}
+
+ 26%|██▌       | 268/1024 [11:08:59<34:43:32, 165.36s/it][AINFO 12-01 06:40:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:40:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:40:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:40:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▋       | 269/1024 [11:11:39<34:22:50, 163.93s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011587693588808179, 'learning_rate': 1e-05, 'num_tokens': 215223199.0, 'completions/mean_length': 6770.1796875, 'completions/min_length': 867.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6460.05615234375, 'completions/min_terminated_length': 867.0, 'completions/max_terminated_length': 15969.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.24093356728553772, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020610256120562553, 'sampling/sampling_logp_difference/max': 15.130342483520508, 'sampling/importance_sampling_ratio/min': 2.685194431251148e-07, 'sampling/importance_sampling_ratio/mean': 0.9999446272850037, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.856469129459583e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 269/1024 [11:11:39<34:22:50, 163.93s/it][AINFO 12-01 06:42:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:42:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:42:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:42:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▋       | 270/1024 [11:14:25<34:27:55, 164.56s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0023960014805197716, 'learning_rate': 1e-05, 'num_tokens': 216191383.0, 'completions/mean_length': 7409.375, 'completions/min_length': 953.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7119.87060546875, 'completions/min_terminated_length': 953.0, 'completions/max_terminated_length': 15575.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020634867250919342, 'sampling/sampling_logp_difference/max': 2.6263370513916016, 'sampling/importance_sampling_ratio/min': 0.07234296947717667, 'sampling/importance_sampling_ratio/mean': 0.9999985694885254, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6798930384284176e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 270/1024 [11:14:25<34:27:55, 164.56s/it][AINFO 12-01 06:45:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:45:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:45:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:45:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 26%|██▋       | 271/1024 [11:16:47<33:00:05, 157.78s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.003013049950823188, 'learning_rate': 1e-05, 'num_tokens': 216978429.0, 'completions/mean_length': 5990.109375, 'completions/min_length': 751.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5908.267578125, 'completions/min_terminated_length': 751.0, 'completions/max_terminated_length': 15286.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3037971258163452, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018537553027272224, 'sampling/sampling_logp_difference/max': 2.302337169647217, 'sampling/importance_sampling_ratio/min': 0.10002478957176208, 'sampling/importance_sampling_ratio/mean': 0.9999964237213135, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.292640685183869e-05, 'epoch': 0.25}
+
+ 26%|██▋       | 271/1024 [11:16:47<33:00:05, 157.78s/it][AINFO 12-01 06:48:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:48:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:48:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:48:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 272/1024 [11:19:16<32:22:24, 154.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014529500622302294, 'learning_rate': 1e-05, 'num_tokens': 217769143.0, 'completions/mean_length': 6024.015625, 'completions/min_length': 743.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5775.37646484375, 'completions/min_terminated_length': 743.0, 'completions/max_terminated_length': 15898.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2811809182167053, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020290132611989975, 'sampling/sampling_logp_difference/max': 5.270960807800293, 'sampling/importance_sampling_ratio/min': 0.005138671025633812, 'sampling/importance_sampling_ratio/mean': 0.9999749660491943, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9645958875335054e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 272/1024 [11:19:16<32:22:24, 154.98s/it][AINFO 12-01 06:50:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:50:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:50:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:50:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 273/1024 [11:21:59<32:49:53, 157.38s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002207882469519973, 'learning_rate': 1e-05, 'num_tokens': 218738502.0, 'completions/mean_length': 7415.6171875, 'completions/min_length': 487.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7345.0, 'completions/min_terminated_length': 487.0, 'completions/max_terminated_length': 16014.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.23250605165958405, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02081884816288948, 'sampling/sampling_logp_difference/max': 2.9392521381378174, 'sampling/importance_sampling_ratio/min': 0.05290528014302254, 'sampling/importance_sampling_ratio/mean': 1.000088095664978, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.192138192138373e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 273/1024 [11:21:59<32:49:53, 157.38s/it][AINFO 12-01 06:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:53:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:53:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 274/1024 [11:24:42<33:08:05, 159.05s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0037211088929325342, 'learning_rate': 1e-05, 'num_tokens': 219670280.0, 'completions/mean_length': 7123.953125, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6976.96875, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 15406.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018946323543787003, 'sampling/sampling_logp_difference/max': 12.294368743896484, 'sampling/importance_sampling_ratio/min': 4.577448635245673e-06, 'sampling/importance_sampling_ratio/mean': 0.9999704360961914, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.689098872428076e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 274/1024 [11:24:42<33:08:05, 159.05s/it][AINFO 12-01 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:55:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 275/1024 [11:27:11<32:28:42, 156.10s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0017603947781026363, 'learning_rate': 1e-05, 'num_tokens': 220557927.0, 'completions/mean_length': 6764.3671875, 'completions/min_length': 1419.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6533.49609375, 'completions/min_terminated_length': 1419.0, 'completions/max_terminated_length': 14924.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.37191063165664673, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01833764836192131, 'sampling/sampling_logp_difference/max': 1.4987483024597168, 'sampling/importance_sampling_ratio/min': 0.2237931489944458, 'sampling/importance_sampling_ratio/mean': 1.00001060962677, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.907917824923061e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 275/1024 [11:27:11<32:28:42, 156.10s/it][AINFO 12-01 06:58:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:58:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:58:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 06:58:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 276/1024 [11:30:15<34:13:02, 164.68s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0017460085218772292, 'learning_rate': 1e-05, 'num_tokens': 221561989.0, 'completions/mean_length': 7678.546875, 'completions/min_length': 881.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7174.92529296875, 'completions/min_terminated_length': 881.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2120065689086914, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01976115256547928, 'sampling/sampling_logp_difference/max': 8.785862922668457, 'sampling/importance_sampling_ratio/min': 0.00015287914720829576, 'sampling/importance_sampling_ratio/mean': 0.999972403049469, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.248591153555026e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 276/1024 [11:30:15<34:13:02, 164.68s/it][AINFO 12-01 07:01:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:01:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:01:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:01:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 277/1024 [11:32:53<33:42:48, 162.47s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0027899756096303463, 'learning_rate': 1e-05, 'num_tokens': 222340757.0, 'completions/mean_length': 5931.5, 'completions/min_length': 755.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5506.6015625, 'completions/min_terminated_length': 755.0, 'completions/max_terminated_length': 15648.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.22461533546447754, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018298109993338585, 'sampling/sampling_logp_difference/max': 9.749932289123535, 'sampling/importance_sampling_ratio/min': 5.82986103836447e-05, 'sampling/importance_sampling_ratio/mean': 0.9999635219573975, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.309608058543745e-05, 'epoch': 0.25}
+
+ 27%|██▋       | 277/1024 [11:32:53<33:42:48, 162.47s/it][AINFO 12-01 07:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:04:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:04:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 278/1024 [11:35:32<33:28:34, 161.55s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.004263557493686676, 'learning_rate': 1e-05, 'num_tokens': 223210419.0, 'completions/mean_length': 6630.671875, 'completions/min_length': 282.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6553.8740234375, 'completions/min_terminated_length': 282.0, 'completions/max_terminated_length': 16299.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.29432153701782227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021038219332695007, 'sampling/sampling_logp_difference/max': 2.0895304679870605, 'sampling/importance_sampling_ratio/min': 0.12374521791934967, 'sampling/importance_sampling_ratio/mean': 0.9998910427093506, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.227074345588335e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 278/1024 [11:35:32<33:28:34, 161.55s/it][AINFO 12-01 07:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:06:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:06:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 279/1024 [11:37:55<32:15:54, 155.91s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0034078259486705065, 'learning_rate': 1e-05, 'num_tokens': 223909165.0, 'completions/mean_length': 5303.015625, 'completions/min_length': 424.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5127.12744140625, 'completions/min_terminated_length': 424.0, 'completions/max_terminated_length': 13142.0, 'rewards/accuracy_reward/mean': 0.6796875, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.6796875, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017712239176034927, 'sampling/sampling_logp_difference/max': 3.2824666500091553, 'sampling/importance_sampling_ratio/min': 0.03753555566072464, 'sampling/importance_sampling_ratio/mean': 0.9999855756759644, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.639338271772431e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 279/1024 [11:37:55<32:15:54, 155.91s/it][AINFO 12-01 07:09:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:09:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:09:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:09:12 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 27%|██▋       | 280/1024 [11:40:39<32:44:47, 158.45s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0010450081899762154, 'learning_rate': 1e-05, 'num_tokens': 224913392.0, 'completions/mean_length': 7704.8984375, 'completions/min_length': 106.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7202.80126953125, 'completions/min_terminated_length': 106.0, 'completions/max_terminated_length': 15900.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.29719969630241394, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017312124371528625, 'sampling/sampling_logp_difference/max': 1.7222599983215332, 'sampling/importance_sampling_ratio/min': 0.17866191267967224, 'sampling/importance_sampling_ratio/mean': 1.0000057220458984, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.253866995895805e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 280/1024 [11:40:39<32:44:47, 158.45s/it][AINFO 12-01 07:11:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:11:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:11:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:11:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 27%|██▋       | 281/1024 [11:43:08<32:06:53, 155.60s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002399445977061987, 'learning_rate': 1e-05, 'num_tokens': 225700007.0, 'completions/mean_length': 6002.6171875, 'completions/min_length': 1005.0, 'completions/max_length': 16296.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6002.6171875, 'completions/min_terminated_length': 1005.0, 'completions/max_terminated_length': 16296.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3400956988334656, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018406879156827927, 'sampling/sampling_logp_difference/max': 3.638152599334717, 'sampling/importance_sampling_ratio/min': 0.026300888508558273, 'sampling/importance_sampling_ratio/mean': 0.9999247193336487, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.4753725862610736e-05, 'epoch': 0.26}
+
+ 27%|██▋       | 281/1024 [11:43:08<32:06:53, 155.60s/it][AINFO 12-01 07:14:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:14:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:14:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:14:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 282/1024 [11:45:55<32:43:45, 158.80s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.003009645501151681, 'learning_rate': 1e-05, 'num_tokens': 226601835.0, 'completions/mean_length': 6905.84375, 'completions/min_length': 611.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6678.3681640625, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.35559535026550293, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018610220402479172, 'sampling/sampling_logp_difference/max': 3.010610818862915, 'sampling/importance_sampling_ratio/min': 0.049261581152677536, 'sampling/importance_sampling_ratio/mean': 1.0000262260437012, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.754121295671212e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 282/1024 [11:45:55<32:43:45, 158.80s/it][AINFO 12-01 07:17:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:17:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:17:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:17:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 283/1024 [11:48:34<32:42:53, 158.94s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017163599841296673, 'learning_rate': 1e-05, 'num_tokens': 227587919.0, 'completions/mean_length': 7548.40625, 'completions/min_length': 800.0, 'completions/max_length': 16170.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7548.40625, 'completions/min_terminated_length': 800.0, 'completions/max_terminated_length': 16170.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.15466687083244324, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.022034550085663795, 'sampling/sampling_logp_difference/max': 4.639448642730713, 'sampling/importance_sampling_ratio/min': 0.009663023985922337, 'sampling/importance_sampling_ratio/mean': 1.0000207424163818, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.435069879742514e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 283/1024 [11:48:34<32:42:53, 158.94s/it][AINFO 12-01 07:19:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:19:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:19:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:19:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 284/1024 [11:51:22<33:14:27, 161.71s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002833625301718712, 'learning_rate': 1e-05, 'num_tokens': 228597891.0, 'completions/mean_length': 7720.90625, 'completions/min_length': 127.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7512.9921875, 'completions/min_terminated_length': 127.0, 'completions/max_terminated_length': 15880.0, 'rewards/accuracy_reward/mean': 0.1953125, 'rewards/accuracy_reward/std': 0.3979988098144531, 'reward': 0.1953125, 'reward_std': 0.17806214094161987, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02092878520488739, 'sampling/sampling_logp_difference/max': 3.3467719554901123, 'sampling/importance_sampling_ratio/min': 0.03519779071211815, 'sampling/importance_sampling_ratio/mean': 0.9999651908874512, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.932936524757679e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 284/1024 [11:51:22<33:14:27, 161.71s/it][AINFO 12-01 07:22:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:22:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:22:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:22:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 285/1024 [11:53:48<32:12:33, 156.91s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0022446129005402327, 'learning_rate': 1e-05, 'num_tokens': 229491169.0, 'completions/mean_length': 6823.296875, 'completions/min_length': 656.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6748.015625, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 15279.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.32849058508872986, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018926549702882767, 'sampling/sampling_logp_difference/max': 2.273702621459961, 'sampling/importance_sampling_ratio/min': 0.10293036699295044, 'sampling/importance_sampling_ratio/mean': 0.9999606609344482, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.747588750182331e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 285/1024 [11:53:48<32:12:33, 156.91s/it][AINFO 12-01 07:25:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:25:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:25:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:25:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 286/1024 [11:56:21<31:57:48, 155.92s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009228324051946402, 'learning_rate': 1e-05, 'num_tokens': 230318886.0, 'completions/mean_length': 6326.1640625, 'completions/min_length': 482.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6084.7763671875, 'completions/min_terminated_length': 482.0, 'completions/max_terminated_length': 15591.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.24541424214839935, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020324576646089554, 'sampling/sampling_logp_difference/max': 6.085868835449219, 'sampling/importance_sampling_ratio/min': 0.002274787053465843, 'sampling/importance_sampling_ratio/mean': 1.0000520944595337, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.999864700039325e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 286/1024 [11:56:21<31:57:48, 155.92s/it][AINFO 12-01 07:27:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:27:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:27:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:27:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 287/1024 [11:58:46<31:15:11, 152.66s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0028006720822304487, 'learning_rate': 1e-05, 'num_tokens': 231075781.0, 'completions/mean_length': 5761.2421875, 'completions/min_length': 349.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5592.62744140625, 'completions/min_terminated_length': 349.0, 'completions/max_terminated_length': 16337.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018558964133262634, 'sampling/sampling_logp_difference/max': 4.531863689422607, 'sampling/importance_sampling_ratio/min': 0.010760603472590446, 'sampling/importance_sampling_ratio/mean': 1.0000426769256592, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.929734900040785e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 287/1024 [11:58:46<31:15:11, 152.66s/it][AINFO 12-01 07:30:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:30:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:30:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:30:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 288/1024 [12:01:36<32:16:57, 157.90s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001862869132310152, 'learning_rate': 1e-05, 'num_tokens': 232025320.0, 'completions/mean_length': 7285.1484375, 'completions/min_length': 1664.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7213.50390625, 'completions/min_terminated_length': 1664.0, 'completions/max_terminated_length': 15745.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.22225633263587952, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018633317202329636, 'sampling/sampling_logp_difference/max': 2.734583854675293, 'sampling/importance_sampling_ratio/min': 0.06492101401090622, 'sampling/importance_sampling_ratio/mean': 0.9999925494194031, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7068779849723796e-05, 'epoch': 0.26}
+
+ 28%|██▊       | 288/1024 [12:01:36<32:16:57, 157.90s/it][AINFO 12-01 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 289/1024 [12:04:13<32:07:28, 157.34s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0048000505194067955, 'learning_rate': 1e-05, 'num_tokens': 232792575.0, 'completions/mean_length': 5842.3046875, 'completions/min_length': 518.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5674.9765625, 'completions/min_terminated_length': 518.0, 'completions/max_terminated_length': 16213.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01878739893436432, 'sampling/sampling_logp_difference/max': 4.363770484924316, 'sampling/importance_sampling_ratio/min': 0.01273029763251543, 'sampling/importance_sampling_ratio/mean': 1.0000146627426147, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.525291410573118e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 289/1024 [12:04:13<32:07:28, 157.34s/it][AINFO 12-01 07:35:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:35:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:35:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:35:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 290/1024 [12:07:10<33:20:06, 163.50s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018108106451109052, 'learning_rate': 1e-05, 'num_tokens': 233791053.0, 'completions/mean_length': 7635.359375, 'completions/min_length': 857.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7496.49267578125, 'completions/min_terminated_length': 857.0, 'completions/max_terminated_length': 16066.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3216509222984314, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020851165056228638, 'sampling/sampling_logp_difference/max': 3.42467999458313, 'sampling/importance_sampling_ratio/min': 0.03255969658493996, 'sampling/importance_sampling_ratio/mean': 1.0000097751617432, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.392992946122831e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 290/1024 [12:07:10<33:20:06, 163.50s/it][AINFO 12-01 07:38:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:38:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:38:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:38:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 28%|██▊       | 291/1024 [12:09:57<33:27:47, 164.35s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020610105711966753, 'learning_rate': 1e-05, 'num_tokens': 234722261.0, 'completions/mean_length': 7132.75, 'completions/min_length': 874.0, 'completions/max_length': 16369.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7132.75, 'completions/min_terminated_length': 874.0, 'completions/max_terminated_length': 16369.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019047319889068604, 'sampling/sampling_logp_difference/max': 2.9250595569610596, 'sampling/importance_sampling_ratio/min': 0.05366149917244911, 'sampling/importance_sampling_ratio/mean': 0.999975860118866, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.914095527259633e-05, 'epoch': 0.27}
+
+ 28%|██▊       | 291/1024 [12:09:57<33:27:47, 164.35s/it][AINFO 12-01 07:41:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:41:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:41:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:41:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▊       | 292/1024 [12:12:34<32:58:29, 162.17s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0014700709143653512, 'learning_rate': 1e-05, 'num_tokens': 235606919.0, 'completions/mean_length': 6761.015625, 'completions/min_length': 696.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6450.5966796875, 'completions/min_terminated_length': 696.0, 'completions/max_terminated_length': 16154.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2212003916501999, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020191647112369537, 'sampling/sampling_logp_difference/max': 3.3569753170013428, 'sampling/importance_sampling_ratio/min': 0.034840479493141174, 'sampling/importance_sampling_ratio/mean': 0.9999991655349731, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.653430794907763e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 292/1024 [12:12:34<32:58:29, 162.17s/it][AINFO 12-01 07:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:43:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▊       | 293/1024 [12:15:15<32:51:46, 161.84s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0024776633363217115, 'learning_rate': 1e-05, 'num_tokens': 236443671.0, 'completions/mean_length': 6385.0625, 'completions/min_length': 602.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5893.31103515625, 'completions/min_terminated_length': 602.0, 'completions/max_terminated_length': 16257.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.38505613803863525, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.022407006472349167, 'sampling/sampling_logp_difference/max': 3.2356367111206055, 'sampling/importance_sampling_ratio/min': 0.03933515399694443, 'sampling/importance_sampling_ratio/mean': 0.9999498724937439, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.358046834975539e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 293/1024 [12:15:15<32:51:46, 161.84s/it][AINFO 12-01 07:46:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:46:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:46:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:46:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▊       | 294/1024 [12:17:39<31:44:59, 156.58s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0024750216398388147, 'learning_rate': 1e-05, 'num_tokens': 237172176.0, 'completions/mean_length': 5483.6953125, 'completions/min_length': 982.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5397.8662109375, 'completions/min_terminated_length': 982.0, 'completions/max_terminated_length': 15053.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.28247418999671936, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017862241715192795, 'sampling/sampling_logp_difference/max': 2.5103297233581543, 'sampling/importance_sampling_ratio/min': 0.08124144375324249, 'sampling/importance_sampling_ratio/mean': 1.0000691413879395, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.746416294030496e-05, 'epoch': 0.27}
+
+ 29%|██▊       | 294/1024 [12:17:39<31:44:59, 156.58s/it][AINFO 12-01 07:48:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:48:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:48:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:48:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 295/1024 [12:20:05<31:03:34, 153.38s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0014924908755347133, 'learning_rate': 1e-05, 'num_tokens': 238083774.0, 'completions/mean_length': 6973.671875, 'completions/min_length': 790.0, 'completions/max_length': 15310.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6973.671875, 'completions/min_terminated_length': 790.0, 'completions/max_terminated_length': 15310.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.2637920379638672, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020449679344892502, 'sampling/sampling_logp_difference/max': 3.205252170562744, 'sampling/importance_sampling_ratio/min': 0.04054867476224899, 'sampling/importance_sampling_ratio/mean': 1.0000556707382202, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.311646614496567e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 295/1024 [12:20:05<31:03:34, 153.38s/it][AINFO 12-01 07:51:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:51:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:51:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:51:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 296/1024 [12:23:03<32:30:02, 160.72s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014906360302120447, 'learning_rate': 1e-05, 'num_tokens': 239187932.0, 'completions/mean_length': 8447.984375, 'completions/min_length': 1086.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7988.87548828125, 'completions/min_terminated_length': 1086.0, 'completions/max_terminated_length': 16362.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.21488474309444427, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02123129740357399, 'sampling/sampling_logp_difference/max': 3.886500358581543, 'sampling/importance_sampling_ratio/min': 0.020517023280262947, 'sampling/importance_sampling_ratio/mean': 0.9999756813049316, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.88087567241746e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 296/1024 [12:23:03<32:30:02, 160.72s/it][AINFO 12-01 07:54:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:54:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:54:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:54:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 297/1024 [12:25:35<31:55:44, 158.11s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019620165694504976, 'learning_rate': 1e-05, 'num_tokens': 239999620.0, 'completions/mean_length': 6195.5, 'completions/min_length': 495.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6115.275390625, 'completions/min_terminated_length': 495.0, 'completions/max_terminated_length': 16058.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018867846578359604, 'sampling/sampling_logp_difference/max': 4.114330291748047, 'sampling/importance_sampling_ratio/min': 0.016336876899003983, 'sampling/importance_sampling_ratio/mean': 0.9999954700469971, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.42112392420313e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 297/1024 [12:25:35<31:55:44, 158.11s/it][AINFO 12-01 07:56:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:56:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:56:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:56:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 298/1024 [12:27:43<30:04:28, 149.13s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0025716947857290506, 'learning_rate': 1e-05, 'num_tokens': 240793396.0, 'completions/mean_length': 6044.0625, 'completions/min_length': 1044.0, 'completions/max_length': 15112.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6044.0625, 'completions/min_terminated_length': 1044.0, 'completions/max_terminated_length': 15112.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3498311936855316, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018796749413013458, 'sampling/sampling_logp_difference/max': 3.0958495140075684, 'sampling/importance_sampling_ratio/min': 0.04523656889796257, 'sampling/importance_sampling_ratio/mean': 1.0000165700912476, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.365757144521922e-05, 'epoch': 0.27}
+
+ 29%|██▉       | 298/1024 [12:27:43<30:04:28, 149.13s/it][AINFO 12-01 07:59:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:59:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:59:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 07:59:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 299/1024 [12:30:22<30:37:29, 152.07s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001186411245726049, 'learning_rate': 1e-05, 'num_tokens': 241779842.0, 'completions/mean_length': 7543.484375, 'completions/min_length': 1014.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7258.30615234375, 'completions/min_terminated_length': 1014.0, 'completions/max_terminated_length': 16088.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021575307473540306, 'sampling/sampling_logp_difference/max': 1.9793213605880737, 'sampling/importance_sampling_ratio/min': 0.13816297054290771, 'sampling/importance_sampling_ratio/mean': 1.0000061988830566, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5726153530267766e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 299/1024 [12:30:22<30:37:29, 152.07s/it][AINFO 12-01 08:01:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:01:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:01:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:01:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 300/1024 [12:33:09<31:29:24, 156.58s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015082844765856862, 'learning_rate': 1e-05, 'num_tokens': 242735801.0, 'completions/mean_length': 7334.1796875, 'completions/min_length': 1286.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7116.984375, 'completions/min_terminated_length': 1286.0, 'completions/max_terminated_length': 16381.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018372444435954094, 'sampling/sampling_logp_difference/max': 3.901068687438965, 'sampling/importance_sampling_ratio/min': 0.02022029086947441, 'sampling/importance_sampling_ratio/mean': 1.0000557899475098, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.0792581987479934e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 300/1024 [12:33:09<31:29:24, 156.58s/it][AINFO 12-01 08:04:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:04:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:04:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:04:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 301/1024 [12:35:37<30:56:17, 154.05s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019596186466515064, 'learning_rate': 1e-05, 'num_tokens': 243611695.0, 'completions/mean_length': 6712.546875, 'completions/min_length': 934.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6559.0322265625, 'completions/min_terminated_length': 934.0, 'completions/max_terminated_length': 15319.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2959064245223999, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019380375742912292, 'sampling/sampling_logp_difference/max': 8.307388305664062, 'sampling/importance_sampling_ratio/min': 0.0002466874720994383, 'sampling/importance_sampling_ratio/mean': 1.000008463859558, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.011420439084759e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 301/1024 [12:35:37<30:56:17, 154.05s/it][AINFO 12-01 08:06:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:06:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:06:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:06:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 29%|██▉       | 302/1024 [12:38:21<31:29:09, 156.99s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.004561781417578459, 'learning_rate': 1e-05, 'num_tokens': 244377224.0, 'completions/mean_length': 5817.3828125, 'completions/min_length': 645.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5387.84521484375, 'completions/min_terminated_length': 645.0, 'completions/max_terminated_length': 16215.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.29985812306404114, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019442548975348473, 'sampling/sampling_logp_difference/max': 3.428119659423828, 'sampling/importance_sampling_ratio/min': 0.032447896897792816, 'sampling/importance_sampling_ratio/mean': 0.99995356798172, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.015517405126957e-05, 'epoch': 0.28}
+
+ 29%|██▉       | 302/1024 [12:38:21<31:29:09, 156.99s/it][AINFO 12-01 08:09:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:09:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:09:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:09:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|██▉       | 303/1024 [12:41:21<32:50:21, 163.97s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002071492839604616, 'learning_rate': 1e-05, 'num_tokens': 245454521.0, 'completions/mean_length': 8269.8203125, 'completions/min_length': 922.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7728.87548828125, 'completions/min_terminated_length': 922.0, 'completions/max_terminated_length': 16019.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.32012516260147095, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021464960649609566, 'sampling/sampling_logp_difference/max': 4.488455772399902, 'sampling/importance_sampling_ratio/min': 0.011237984523177147, 'sampling/importance_sampling_ratio/mean': 1.0000483989715576, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.236706306277483e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 303/1024 [12:41:21<32:50:21, 163.97s/it][AINFO 12-01 08:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:12:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:12:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|██▉       | 304/1024 [12:44:22<33:47:31, 168.96s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022035713773220778, 'learning_rate': 1e-05, 'num_tokens': 246378889.0, 'completions/mean_length': 7077.25, 'completions/min_length': 629.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6288.54248046875, 'completions/min_terminated_length': 629.0, 'completions/max_terminated_length': 16165.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.18253791332244873, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019167516380548477, 'sampling/sampling_logp_difference/max': 3.1993393898010254, 'sampling/importance_sampling_ratio/min': 0.04078913852572441, 'sampling/importance_sampling_ratio/mean': 1.0000531673431396, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.422927013867593e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 304/1024 [12:44:22<33:47:31, 168.96s/it][AINFO 12-01 08:15:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:15:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:15:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:15:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|██▉       | 305/1024 [12:46:53<32:41:53, 163.72s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002328289905562997, 'learning_rate': 1e-05, 'num_tokens': 247225374.0, 'completions/mean_length': 6464.7890625, 'completions/min_length': 675.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6307.341796875, 'completions/min_terminated_length': 675.0, 'completions/max_terminated_length': 15410.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.32325342297554016, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019406337291002274, 'sampling/sampling_logp_difference/max': 3.116650104522705, 'sampling/importance_sampling_ratio/min': 0.04430533945560455, 'sampling/importance_sampling_ratio/mean': 1.000032901763916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.213217420736328e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 305/1024 [12:46:53<32:41:53, 163.72s/it][AINFO 12-01 08:18:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:18:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:18:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:18:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|██▉       | 306/1024 [12:49:56<33:45:24, 169.25s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001942771370522678, 'learning_rate': 1e-05, 'num_tokens': 248339377.0, 'completions/mean_length': 8569.8984375, 'completions/min_length': 1042.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8317.830078125, 'completions/min_terminated_length': 1042.0, 'completions/max_terminated_length': 15906.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.20753081142902374, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021387826651334763, 'sampling/sampling_logp_difference/max': 2.7862892150878906, 'sampling/importance_sampling_ratio/min': 0.06164956092834473, 'sampling/importance_sampling_ratio/mean': 0.9999514222145081, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.42878684907555e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 306/1024 [12:49:56<33:45:24, 169.25s/it][AINFO 12-01 08:21:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:21:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:21:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:21:12 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|██▉       | 307/1024 [12:52:16<31:59:38, 160.64s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003485191846266389, 'learning_rate': 1e-05, 'num_tokens': 249170277.0, 'completions/mean_length': 6316.53125, 'completions/min_length': 1033.0, 'completions/max_length': 15722.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6316.53125, 'completions/min_terminated_length': 1033.0, 'completions/max_terminated_length': 15722.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2359209954738617, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02164064347743988, 'sampling/sampling_logp_difference/max': 5.736531734466553, 'sampling/importance_sampling_ratio/min': 0.0032259372528642416, 'sampling/importance_sampling_ratio/mean': 1.0000240802764893, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.168116001006638e-05, 'epoch': 0.28}
+
+ 30%|██▉       | 307/1024 [12:52:16<31:59:38, 160.64s/it][AINFO 12-01 08:23:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:23:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:23:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:23:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|███       | 308/1024 [12:55:09<32:40:50, 164.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0030411493498831987, 'learning_rate': 1e-05, 'num_tokens': 250066346.0, 'completions/mean_length': 6872.1640625, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6797.267578125, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15566.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3243093490600586, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020623864606022835, 'sampling/sampling_logp_difference/max': 2.6427369117736816, 'sampling/importance_sampling_ratio/min': 0.07116623222827911, 'sampling/importance_sampling_ratio/mean': 0.9999606609344482, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7881811269689933e-05, 'epoch': 0.28}
+
+ 30%|███       | 308/1024 [12:55:09<32:40:50, 164.32s/it][AINFO 12-01 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:26:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|███       | 309/1024 [12:57:44<32:05:26, 161.58s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014674448175355792, 'learning_rate': 1e-05, 'num_tokens': 250870607.0, 'completions/mean_length': 6128.7890625, 'completions/min_length': 1091.0, 'completions/max_length': 16207.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6128.7890625, 'completions/min_terminated_length': 1091.0, 'completions/max_terminated_length': 16207.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.327729195356369, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0169319286942482, 'sampling/sampling_logp_difference/max': 1.5500338077545166, 'sampling/importance_sampling_ratio/min': 0.2122407853603363, 'sampling/importance_sampling_ratio/mean': 1.000065803527832, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.55451611035096e-05, 'epoch': 0.28}
+
+ 30%|███       | 309/1024 [12:57:44<32:05:26, 161.58s/it][AINFO 12-01 08:29:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:29:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:29:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:29:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|███       | 310/1024 [13:00:32<32:25:59, 163.53s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015806586015969515, 'learning_rate': 1e-05, 'num_tokens': 251874138.0, 'completions/mean_length': 7683.4609375, 'completions/min_length': 1371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7545.357421875, 'completions/min_terminated_length': 1371.0, 'completions/max_terminated_length': 16175.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2227931022644043, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018930276855826378, 'sampling/sampling_logp_difference/max': 4.03227424621582, 'sampling/importance_sampling_ratio/min': 0.017733952030539513, 'sampling/importance_sampling_ratio/mean': 0.9999857544898987, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.4088620193651877e-05, 'epoch': 0.29}
+
+ 30%|███       | 310/1024 [13:00:32<32:25:59, 163.53s/it][AINFO 12-01 08:31:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:31:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:31:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:31:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|███       | 311/1024 [13:02:54<31:05:40, 157.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017116712406277657, 'learning_rate': 1e-05, 'num_tokens': 252630583.0, 'completions/mean_length': 5767.1640625, 'completions/min_length': 320.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5683.56689453125, 'completions/min_terminated_length': 320.0, 'completions/max_terminated_length': 16323.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2301519513130188, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018449850380420685, 'sampling/sampling_logp_difference/max': 2.694835662841797, 'sampling/importance_sampling_ratio/min': 0.06755348294973373, 'sampling/importance_sampling_ratio/mean': 1.000015377998352, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.654553620435763e-05, 'epoch': 0.29}
+
+ 30%|███       | 311/1024 [13:02:54<31:05:40, 157.00s/it][AINFO 12-01 08:34:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:34:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:34:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:34:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 30%|███       | 312/1024 [13:05:52<32:18:00, 163.32s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0018667849944904447, 'learning_rate': 1e-05, 'num_tokens': 253539491.0, 'completions/mean_length': 6958.96875, 'completions/min_length': 791.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6884.755859375, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 16127.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020706312730908394, 'sampling/sampling_logp_difference/max': 2.3931198120117188, 'sampling/importance_sampling_ratio/min': 0.09134425967931747, 'sampling/importance_sampling_ratio/mean': 1.0000157356262207, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.8975991830957355e-05, 'epoch': 0.29}
+
+ 30%|███       | 312/1024 [13:05:52<32:18:00, 163.32s/it][AINFO 12-01 08:37:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:37:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:37:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:37:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███       | 313/1024 [13:08:53<33:16:35, 168.49s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001921940827742219, 'learning_rate': 1e-05, 'num_tokens': 254606636.0, 'completions/mean_length': 8174.1953125, 'completions/min_length': 825.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7909.36279296875, 'completions/min_terminated_length': 825.0, 'completions/max_terminated_length': 16014.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.31930166482925415, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020522577688097954, 'sampling/sampling_logp_difference/max': 4.6560444831848145, 'sampling/importance_sampling_ratio/min': 0.009503981098532677, 'sampling/importance_sampling_ratio/mean': 0.9999836683273315, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.479764897449058e-05, 'epoch': 0.29}
+
+ 31%|███       | 313/1024 [13:08:53<33:16:35, 168.49s/it][AINFO 12-01 08:40:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:40:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:40:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:40:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███       | 314/1024 [13:11:31<32:37:39, 165.44s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0021366209257394075, 'learning_rate': 1e-05, 'num_tokens': 255445682.0, 'completions/mean_length': 6394.796875, 'completions/min_length': 804.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6236.23828125, 'completions/min_terminated_length': 804.0, 'completions/max_terminated_length': 15247.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019587717950344086, 'sampling/sampling_logp_difference/max': 2.848909378051758, 'sampling/importance_sampling_ratio/min': 0.05790744349360466, 'sampling/importance_sampling_ratio/mean': 1.000028133392334, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.775157799485896e-05, 'epoch': 0.29}
+
+ 31%|███       | 314/1024 [13:11:31<32:37:39, 165.44s/it][AINFO 12-01 08:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:42:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███       | 315/1024 [13:14:37<33:46:54, 171.53s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021615088917315006, 'learning_rate': 1e-05, 'num_tokens': 256442795.0, 'completions/mean_length': 7569.3828125, 'completions/min_length': 1021.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7211.06494140625, 'completions/min_terminated_length': 1021.0, 'completions/max_terminated_length': 16273.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02075791358947754, 'sampling/sampling_logp_difference/max': 1.7693898677825928, 'sampling/importance_sampling_ratio/min': 0.17043694853782654, 'sampling/importance_sampling_ratio/mean': 1.000032901763916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1550071248129825e-05, 'epoch': 0.29}
+
+ 31%|███       | 315/1024 [13:14:37<33:46:54, 171.53s/it][AINFO 12-01 08:45:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:45:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:45:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:45:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███       | 316/1024 [13:17:50<35:01:54, 178.13s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015748273581266403, 'learning_rate': 1e-05, 'num_tokens': 257583548.0, 'completions/mean_length': 8782.6953125, 'completions/min_length': 1394.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8275.9423828125, 'completions/min_terminated_length': 1394.0, 'completions/max_terminated_length': 15905.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.321418434381485, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01987791806459427, 'sampling/sampling_logp_difference/max': 4.481658935546875, 'sampling/importance_sampling_ratio/min': 0.011314627714455128, 'sampling/importance_sampling_ratio/mean': 0.9999714493751526, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.705221929019899e-05, 'epoch': 0.29}
+
+ 31%|███       | 316/1024 [13:17:50<35:01:54, 178.13s/it][AINFO 12-01 08:49:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:49:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:49:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:49:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███       | 317/1024 [13:20:54<35:18:32, 179.79s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011800741776823997, 'learning_rate': 1e-05, 'num_tokens': 258508007.0, 'completions/mean_length': 7076.3984375, 'completions/min_length': 898.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6853.01611328125, 'completions/min_terminated_length': 898.0, 'completions/max_terminated_length': 16133.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.24777325987815857, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018987935036420822, 'sampling/sampling_logp_difference/max': 3.756122589111328, 'sampling/importance_sampling_ratio/min': 0.023374196141958237, 'sampling/importance_sampling_ratio/mean': 1.000013828277588, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6324710638145916e-05, 'epoch': 0.29}
+
+ 31%|███       | 317/1024 [13:20:54<35:18:32, 179.79s/it][AINFO 12-01 08:52:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:52:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:52:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:52:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███       | 318/1024 [13:23:00<32:06:01, 163.68s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001389914657920599, 'learning_rate': 1e-05, 'num_tokens': 259317464.0, 'completions/mean_length': 6181.8828125, 'completions/min_length': 1008.0, 'completions/max_length': 13500.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6181.8828125, 'completions/min_terminated_length': 1008.0, 'completions/max_terminated_length': 13500.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.3316681683063507, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019156958907842636, 'sampling/sampling_logp_difference/max': 2.9197587966918945, 'sampling/importance_sampling_ratio/min': 0.05394669994711876, 'sampling/importance_sampling_ratio/mean': 1.0000722408294678, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6817589748116006e-05, 'epoch': 0.29}
+
+ 31%|███       | 318/1024 [13:23:00<32:06:01, 163.68s/it][AINFO 12-01 08:54:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:54:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:54:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:54:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███       | 319/1024 [13:25:47<32:14:22, 164.63s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013155591441318393, 'learning_rate': 1e-05, 'num_tokens': 260283118.0, 'completions/mean_length': 7396.109375, 'completions/min_length': 1123.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7180.400390625, 'completions/min_terminated_length': 1123.0, 'completions/max_terminated_length': 15893.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01900000311434269, 'sampling/sampling_logp_difference/max': 1.4225291013717651, 'sampling/importance_sampling_ratio/min': 0.24110348522663116, 'sampling/importance_sampling_ratio/mean': 1.0000684261322021, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.281407487065735e-05, 'epoch': 0.29}
+
+ 31%|███       | 319/1024 [13:25:47<32:14:22, 164.63s/it][AINFO 12-01 08:57:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:57:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:57:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:57:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███▏      | 320/1024 [13:28:25<31:47:14, 162.55s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0017647831700742245, 'learning_rate': 1e-05, 'num_tokens': 261181622.0, 'completions/mean_length': 6872.625, 'completions/min_length': 852.0, 'completions/max_length': 14945.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6872.625, 'completions/min_terminated_length': 852.0, 'completions/max_terminated_length': 14945.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.27274850010871887, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02079041115939617, 'sampling/sampling_logp_difference/max': 1.7069451808929443, 'sampling/importance_sampling_ratio/min': 0.18141914904117584, 'sampling/importance_sampling_ratio/mean': 0.9999293684959412, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.342405074770795e-05, 'epoch': 0.29}
+
+ 31%|███▏      | 320/1024 [13:28:25<31:47:14, 162.55s/it][AINFO 12-01 08:59:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:59:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:59:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 08:59:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███▏      | 321/1024 [13:30:48<30:37:51, 156.86s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002655226970091462, 'learning_rate': 1e-05, 'num_tokens': 261940189.0, 'completions/mean_length': 5787.3046875, 'completions/min_length': 907.0, 'completions/max_length': 15325.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5787.3046875, 'completions/min_terminated_length': 907.0, 'completions/max_terminated_length': 15325.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.26013973355293274, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01960219070315361, 'sampling/sampling_logp_difference/max': 3.135334014892578, 'sampling/importance_sampling_ratio/min': 0.21341054141521454, 'sampling/importance_sampling_ratio/mean': 1.000058889389038, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1334387674396567e-05, 'epoch': 0.3}
+
+ 31%|███▏      | 321/1024 [13:30:48<30:37:51, 156.86s/it][AINFO 12-01 09:02:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:02:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:02:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:02:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 31%|███▏      | 322/1024 [13:33:47<31:50:30, 163.29s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021876750979572535, 'learning_rate': 1e-05, 'num_tokens': 262860298.0, 'completions/mean_length': 7039.4765625, 'completions/min_length': 434.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6815.20849609375, 'completions/min_terminated_length': 434.0, 'completions/max_terminated_length': 15497.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2987973093986511, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02021763287484646, 'sampling/sampling_logp_difference/max': 2.6460161209106445, 'sampling/importance_sampling_ratio/min': 0.07093323767185211, 'sampling/importance_sampling_ratio/mean': 1.0000483989715576, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.232973265061446e-05, 'epoch': 0.3}
+
+ 31%|███▏      | 322/1024 [13:33:47<31:50:30, 163.29s/it][AINFO 12-01 09:05:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:05:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:05:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:05:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 323/1024 [13:36:15<30:56:55, 158.94s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015622344799339771, 'learning_rate': 1e-05, 'num_tokens': 263686513.0, 'completions/mean_length': 6296.6796875, 'completions/min_length': 1224.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6054.58447265625, 'completions/min_terminated_length': 1224.0, 'completions/max_terminated_length': 16024.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.2238539308309555, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019766859710216522, 'sampling/sampling_logp_difference/max': 3.931058883666992, 'sampling/importance_sampling_ratio/min': 0.019622882828116417, 'sampling/importance_sampling_ratio/mean': 1.0000685453414917, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.548513584268221e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 323/1024 [13:36:15<30:56:55, 158.94s/it][AINFO 12-01 09:07:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:07:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:07:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:07:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 324/1024 [13:39:15<32:06:58, 165.17s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013804023619741201, 'learning_rate': 1e-05, 'num_tokens': 264637477.0, 'completions/mean_length': 7288.84375, 'completions/min_length': 509.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7070.560546875, 'completions/min_terminated_length': 509.0, 'completions/max_terminated_length': 15714.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0190635547041893, 'sampling/sampling_logp_difference/max': 5.801871299743652, 'sampling/importance_sampling_ratio/min': 0.003021894721314311, 'sampling/importance_sampling_ratio/mean': 1.0000410079956055, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6208270507340785e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 324/1024 [13:39:15<32:06:58, 165.17s/it][AINFO 12-01 09:10:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:10:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:10:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:10:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 325/1024 [13:42:12<32:44:37, 168.64s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018749439623206854, 'learning_rate': 1e-05, 'num_tokens': 265704093.0, 'completions/mean_length': 8199.5, 'completions/min_length': 522.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7935.4833984375, 'completions/min_terminated_length': 522.0, 'completions/max_terminated_length': 16012.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3379838466644287, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020366985350847244, 'sampling/sampling_logp_difference/max': 5.7469635009765625, 'sampling/importance_sampling_ratio/min': 0.0031924599315971136, 'sampling/importance_sampling_ratio/mean': 0.9999596476554871, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.252074535950669e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 325/1024 [13:42:12<32:44:37, 168.64s/it][AINFO 12-01 09:13:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:13:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:13:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:13:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 326/1024 [13:45:07<33:04:19, 170.57s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0026626272592693567, 'learning_rate': 1e-05, 'num_tokens': 266705465.0, 'completions/mean_length': 7689.46875, 'completions/min_length': 981.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7186.47900390625, 'completions/min_terminated_length': 981.0, 'completions/max_terminated_length': 15924.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2120065689086914, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018800940364599228, 'sampling/sampling_logp_difference/max': 2.9905130863189697, 'sampling/importance_sampling_ratio/min': 0.050261642783880234, 'sampling/importance_sampling_ratio/mean': 0.9999584555625916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.621914763447421e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 326/1024 [13:45:07<33:04:19, 170.57s/it][AINFO 12-01 09:16:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:16:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:16:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:16:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 327/1024 [13:48:06<33:31:58, 173.20s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002345885382965207, 'learning_rate': 1e-05, 'num_tokens': 267655072.0, 'completions/mean_length': 7266.0546875, 'completions/min_length': 288.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6738.56982421875, 'completions/min_terminated_length': 288.0, 'completions/max_terminated_length': 16091.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.28801077604293823, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01958080753684044, 'sampling/sampling_logp_difference/max': 2.34867000579834, 'sampling/importance_sampling_ratio/min': 0.09549608826637268, 'sampling/importance_sampling_ratio/mean': 0.9999850392341614, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.004926199741021e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 327/1024 [13:48:06<33:31:58, 173.20s/it][AINFO 12-01 09:19:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:19:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:19:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:19:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 328/1024 [13:51:03<33:40:50, 174.21s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017619420541450381, 'learning_rate': 1e-05, 'num_tokens': 268577158.0, 'completions/mean_length': 7060.359375, 'completions/min_length': 872.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6986.94482421875, 'completions/min_terminated_length': 872.0, 'completions/max_terminated_length': 16298.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.26409149169921875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021649383008480072, 'sampling/sampling_logp_difference/max': 4.620420455932617, 'sampling/importance_sampling_ratio/min': 0.00984865427017212, 'sampling/importance_sampling_ratio/mean': 0.9999632835388184, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.417719128468889e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 328/1024 [13:51:03<33:40:50, 174.21s/it][AINFO 12-01 09:22:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:22:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:22:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:22:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 329/1024 [13:53:33<32:14:15, 166.99s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002009752904996276, 'learning_rate': 1e-05, 'num_tokens': 269511823.0, 'completions/mean_length': 7137.1328125, 'completions/min_length': 259.0, 'completions/max_length': 15170.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7137.1328125, 'completions/min_terminated_length': 259.0, 'completions/max_terminated_length': 15170.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29249146580696106, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02042423188686371, 'sampling/sampling_logp_difference/max': 2.2248330116271973, 'sampling/importance_sampling_ratio/min': 0.10808546841144562, 'sampling/importance_sampling_ratio/mean': 0.9999421238899231, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.542470696833334e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 329/1024 [13:53:33<32:14:15, 166.99s/it][AINFO 12-01 09:24:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:24:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:24:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:24:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 330/1024 [13:55:50<30:28:58, 158.12s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002587186871096492, 'learning_rate': 1e-05, 'num_tokens': 270248447.0, 'completions/mean_length': 5612.625, 'completions/min_length': 1360.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5527.81103515625, 'completions/min_terminated_length': 1360.0, 'completions/max_terminated_length': 13119.0, 'rewards/accuracy_reward/mean': 0.6796875, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.6796875, 'reward_std': 0.20357418060302734, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01811870187520981, 'sampling/sampling_logp_difference/max': 2.7749152183532715, 'sampling/importance_sampling_ratio/min': 0.062354762107133865, 'sampling/importance_sampling_ratio/mean': 0.9999539852142334, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7652445143976365e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 330/1024 [13:55:50<30:28:58, 158.12s/it][AINFO 12-01 09:27:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:27:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:27:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:27:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 331/1024 [13:58:16<29:44:44, 154.52s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0027542049065232277, 'learning_rate': 1e-05, 'num_tokens': 271091069.0, 'completions/mean_length': 6427.484375, 'completions/min_length': 799.0, 'completions/max_length': 15945.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6427.484375, 'completions/min_terminated_length': 799.0, 'completions/max_terminated_length': 15945.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.22331714630126953, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017686372622847557, 'sampling/sampling_logp_difference/max': 2.188302516937256, 'sampling/importance_sampling_ratio/min': 0.11210688948631287, 'sampling/importance_sampling_ratio/mean': 1.0000110864639282, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.3940712480907678e-05, 'epoch': 0.3}
+
+ 32%|███▏      | 331/1024 [13:58:16<29:44:44, 154.52s/it][AINFO 12-01 09:29:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:29:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:29:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:29:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 32%|███▏      | 332/1024 [14:00:58<30:06:01, 156.59s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002394787734374404, 'learning_rate': 1e-05, 'num_tokens': 271941859.0, 'completions/mean_length': 6478.359375, 'completions/min_length': 570.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6158.822265625, 'completions/min_terminated_length': 570.0, 'completions/max_terminated_length': 15561.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2937847375869751, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019578102976083755, 'sampling/sampling_logp_difference/max': 7.05142068862915, 'sampling/importance_sampling_ratio/min': 0.0008661774918437004, 'sampling/importance_sampling_ratio/mean': 0.9999170303344727, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.194113341782213e-05, 'epoch': 0.31}
+
+ 32%|███▏      | 332/1024 [14:00:58<30:06:01, 156.59s/it][AINFO 12-01 09:32:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:32:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:32:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:32:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 333/1024 [14:03:24<29:28:22, 153.55s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0030469561461359262, 'learning_rate': 1e-05, 'num_tokens': 272813980.0, 'completions/mean_length': 6662.7578125, 'completions/min_length': 397.0, 'completions/max_length': 15704.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6662.7578125, 'completions/min_terminated_length': 397.0, 'completions/max_terminated_length': 15704.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.25620076060295105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022478599101305008, 'sampling/sampling_logp_difference/max': 4.484858512878418, 'sampling/importance_sampling_ratio/min': 0.011278483085334301, 'sampling/importance_sampling_ratio/mean': 1.000074863433838, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.65463181803716e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 333/1024 [14:03:24<29:28:22, 153.55s/it][AINFO 12-01 09:34:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:34:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:34:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:34:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 334/1024 [14:06:17<30:32:39, 159.36s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0027610845863819122, 'learning_rate': 1e-05, 'num_tokens': 273674610.0, 'completions/mean_length': 6565.171875, 'completions/min_length': 935.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6409.31787109375, 'completions/min_terminated_length': 935.0, 'completions/max_terminated_length': 16369.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0197727270424366, 'sampling/sampling_logp_difference/max': 3.211190700531006, 'sampling/importance_sampling_ratio/min': 0.04030859097838402, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.192316407625185e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 334/1024 [14:06:17<30:32:39, 159.36s/it][AINFO 12-01 09:37:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:37:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:37:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:37:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 335/1024 [14:09:12<31:21:29, 163.85s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0012756467331200838, 'learning_rate': 1e-05, 'num_tokens': 274723017.0, 'completions/mean_length': 8038.4296875, 'completions/min_length': 932.0, 'completions/max_length': 16160.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 8038.4296875, 'completions/min_terminated_length': 932.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2975040376186371, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.022875797003507614, 'sampling/sampling_logp_difference/max': 2.623220682144165, 'sampling/importance_sampling_ratio/min': 0.07256876677274704, 'sampling/importance_sampling_ratio/mean': 0.9999386072158813, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.140634673101886e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 335/1024 [14:09:12<31:21:29, 163.85s/it][AINFO 12-01 09:40:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:40:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:40:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:40:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 336/1024 [14:11:30<29:51:10, 156.21s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012538193259388208, 'learning_rate': 1e-05, 'num_tokens': 275570080.0, 'completions/mean_length': 6470.1171875, 'completions/min_length': 1061.0, 'completions/max_length': 15187.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6470.1171875, 'completions/min_terminated_length': 1061.0, 'completions/max_terminated_length': 15187.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.18884865939617157, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020740266889333725, 'sampling/sampling_logp_difference/max': 3.462751865386963, 'sampling/importance_sampling_ratio/min': 0.03134339302778244, 'sampling/importance_sampling_ratio/mean': 0.9999693036079407, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7310850757421576e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 336/1024 [14:11:30<29:51:10, 156.21s/it][AINFO 12-01 09:42:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:42:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:42:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:42:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 337/1024 [14:14:53<32:30:41, 170.37s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018285427941009402, 'learning_rate': 1e-05, 'num_tokens': 276618553.0, 'completions/mean_length': 8009.4453125, 'completions/min_length': 958.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7524.966796875, 'completions/min_terminated_length': 958.0, 'completions/max_terminated_length': 16351.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.30327796936035156, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02183813974261284, 'sampling/sampling_logp_difference/max': 2.732706069946289, 'sampling/importance_sampling_ratio/min': 0.06504303961992264, 'sampling/importance_sampling_ratio/mean': 1.0000548362731934, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.275174448295729e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 337/1024 [14:14:53<32:30:41, 170.37s/it][AINFO 12-01 09:46:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:46:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:46:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:46:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 338/1024 [14:17:46<32:34:27, 170.94s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018328198930248618, 'learning_rate': 1e-05, 'num_tokens': 277557584.0, 'completions/mean_length': 7176.9296875, 'completions/min_length': 1520.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7030.7861328125, 'completions/min_terminated_length': 1520.0, 'completions/max_terminated_length': 16308.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.24275580048561096, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021012119948863983, 'sampling/sampling_logp_difference/max': 4.99385929107666, 'sampling/importance_sampling_ratio/min': 0.006779449991881847, 'sampling/importance_sampling_ratio/mean': 1.0000104904174805, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5136473772799945e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 338/1024 [14:17:46<32:34:27, 170.94s/it][AINFO 12-01 09:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:49:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 339/1024 [14:20:20<31:35:02, 165.99s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002455603564158082, 'learning_rate': 1e-05, 'num_tokens': 278294739.0, 'completions/mean_length': 5571.0859375, 'completions/min_length': 393.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5485.94482421875, 'completions/min_terminated_length': 393.0, 'completions/max_terminated_length': 15091.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.14465448260307312, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.019600313156843185, 'sampling/sampling_logp_difference/max': 2.2535505294799805, 'sampling/importance_sampling_ratio/min': 0.10502566397190094, 'sampling/importance_sampling_ratio/mean': 0.9999660849571228, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0111747946648393e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 339/1024 [14:20:20<31:35:02, 165.99s/it][AINFO 12-01 09:51:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:51:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:51:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:51:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 340/1024 [14:23:05<31:29:56, 165.78s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016635354841127992, 'learning_rate': 1e-05, 'num_tokens': 279290462.0, 'completions/mean_length': 7631.5859375, 'completions/min_length': 1226.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7421.5283203125, 'completions/min_terminated_length': 1226.0, 'completions/max_terminated_length': 15961.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.264615535736084, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020130585879087448, 'sampling/sampling_logp_difference/max': 5.351283550262451, 'sampling/importance_sampling_ratio/min': 0.004742060322314501, 'sampling/importance_sampling_ratio/mean': 1.000056505203247, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1618304976509535e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 340/1024 [14:23:05<31:29:56, 165.78s/it][AINFO 12-01 09:54:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:54:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:54:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:54:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 341/1024 [14:25:41<30:53:44, 162.85s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017422637902200222, 'learning_rate': 1e-05, 'num_tokens': 280246130.0, 'completions/mean_length': 7313.59375, 'completions/min_length': 792.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7169.61962890625, 'completions/min_terminated_length': 792.0, 'completions/max_terminated_length': 16277.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.34929439425468445, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020403482019901276, 'sampling/sampling_logp_difference/max': 2.8108975887298584, 'sampling/importance_sampling_ratio/min': 0.06015097722411156, 'sampling/importance_sampling_ratio/mean': 0.9999756217002869, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.950735152917332e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 341/1024 [14:25:41<30:53:44, 162.85s/it][AINFO 12-01 09:56:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:56:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:56:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 09:56:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 342/1024 [14:28:43<31:56:39, 168.62s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0014955326914787292, 'learning_rate': 1e-05, 'num_tokens': 281162068.0, 'completions/mean_length': 7019.640625, 'completions/min_length': 978.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6717.564453125, 'completions/min_terminated_length': 978.0, 'completions/max_terminated_length': 16247.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019378289580345154, 'sampling/sampling_logp_difference/max': 1.8125624656677246, 'sampling/importance_sampling_ratio/min': 0.16323530673980713, 'sampling/importance_sampling_ratio/mean': 0.9999910593032837, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4802831805791357e-05, 'epoch': 0.31}
+
+ 33%|███▎      | 342/1024 [14:28:43<31:56:39, 168.62s/it][AINFO 12-01 10:00:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:00:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:00:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:00:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 33%|███▎      | 343/1024 [14:31:30<31:46:29, 167.97s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011029327288269997, 'learning_rate': 1e-05, 'num_tokens': 282061536.0, 'completions/mean_length': 6864.53125, 'completions/min_length': 1049.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6396.3603515625, 'completions/min_terminated_length': 1049.0, 'completions/max_terminated_length': 15605.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.3874102830886841, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017977923154830933, 'sampling/sampling_logp_difference/max': 4.527283191680908, 'sampling/importance_sampling_ratio/min': 0.010810005478560925, 'sampling/importance_sampling_ratio/mean': 1.0000135898590088, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3497289400183945e-05, 'epoch': 0.32}
+
+ 33%|███▎      | 343/1024 [14:31:30<31:46:29, 167.97s/it][AINFO 12-01 10:02:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:02:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:02:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:02:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▎      | 344/1024 [14:34:17<31:41:10, 167.75s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.0017053206684067845, 'learning_rate': 1e-05, 'num_tokens': 283027993.0, 'completions/mean_length': 7401.0078125, 'completions/min_length': 1339.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7185.41650390625, 'completions/min_terminated_length': 1339.0, 'completions/max_terminated_length': 16233.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02100560814142227, 'sampling/sampling_logp_difference/max': 11.872836112976074, 'sampling/importance_sampling_ratio/min': 6.9773868744960055e-06, 'sampling/importance_sampling_ratio/mean': 1.0000090599060059, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.970376699020562e-05, 'epoch': 0.32}
+
+ 34%|███▎      | 344/1024 [14:34:17<31:41:10, 167.75s/it][AINFO 12-01 10:05:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:05:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:05:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:05:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▎      | 345/1024 [14:36:45<30:32:17, 161.91s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003388978075236082, 'learning_rate': 1e-05, 'num_tokens': 283775920.0, 'completions/mean_length': 5675.5546875, 'completions/min_length': 529.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5591.236328125, 'completions/min_terminated_length': 529.0, 'completions/max_terminated_length': 15658.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.36797162890434265, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019859781488776207, 'sampling/sampling_logp_difference/max': 2.5615286827087402, 'sampling/importance_sampling_ratio/min': 0.07718665897846222, 'sampling/importance_sampling_ratio/mean': 0.99998539686203, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.342821973248647e-05, 'epoch': 0.32}
+
+ 34%|███▎      | 345/1024 [14:36:45<30:32:17, 161.91s/it][AINFO 12-01 10:08:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:08:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:08:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:08:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▍      | 346/1024 [14:39:40<31:11:38, 165.63s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013408261584118009, 'learning_rate': 1e-05, 'num_tokens': 284719340.0, 'completions/mean_length': 7220.21875, 'completions/min_length': 406.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7074.76220703125, 'completions/min_terminated_length': 406.0, 'completions/max_terminated_length': 14533.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02152540162205696, 'sampling/sampling_logp_difference/max': 14.193070411682129, 'sampling/importance_sampling_ratio/min': 6.855321998955333e-07, 'sampling/importance_sampling_ratio/mean': 0.9995501041412354, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6612543428636855e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 346/1024 [14:39:40<31:11:38, 165.63s/it][AINFO 12-01 10:10:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:10:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:10:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:10:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▍      | 347/1024 [14:42:27<31:14:17, 166.11s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016646332805976272, 'learning_rate': 1e-05, 'num_tokens': 285593059.0, 'completions/mean_length': 6673.7421875, 'completions/min_length': 317.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6279.01611328125, 'completions/min_terminated_length': 317.0, 'completions/max_terminated_length': 15487.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.26827272772789, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02121761441230774, 'sampling/sampling_logp_difference/max': 7.59326171875, 'sampling/importance_sampling_ratio/min': 0.0005038350354880095, 'sampling/importance_sampling_ratio/mean': 0.9999375939369202, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1584786913517746e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 347/1024 [14:42:27<31:14:17, 166.11s/it][AINFO 12-01 10:13:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:13:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:13:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:13:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▍      | 348/1024 [14:45:05<30:45:12, 163.78s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002480244729667902, 'learning_rate': 1e-05, 'num_tokens': 286517688.0, 'completions/mean_length': 7073.2265625, 'completions/min_length': 1377.0, 'completions/max_length': 16283.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7073.2265625, 'completions/min_terminated_length': 1377.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3095887303352356, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021183747798204422, 'sampling/sampling_logp_difference/max': 2.146350383758545, 'sampling/importance_sampling_ratio/min': 0.11691005527973175, 'sampling/importance_sampling_ratio/mean': 0.9999927282333374, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.4839598419675895e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 348/1024 [14:45:05<30:45:12, 163.78s/it][AINFO 12-01 10:16:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:16:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:16:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:16:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▍      | 349/1024 [14:47:41<30:15:22, 161.37s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001381990616209805, 'learning_rate': 1e-05, 'num_tokens': 287284127.0, 'completions/mean_length': 5801.8671875, 'completions/min_length': 1112.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5633.89697265625, 'completions/min_terminated_length': 1112.0, 'completions/max_terminated_length': 15574.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.17859892547130585, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01844586431980133, 'sampling/sampling_logp_difference/max': 1.4857771396636963, 'sampling/importance_sampling_ratio/min': 0.22632639110088348, 'sampling/importance_sampling_ratio/mean': 1.0000414848327637, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.6604004233377054e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 349/1024 [14:47:41<30:15:22, 161.37s/it][AINFO 12-01 10:18:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:18:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:18:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:18:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▍      | 350/1024 [14:50:04<29:10:43, 155.85s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018281774828210473, 'learning_rate': 1e-05, 'num_tokens': 288070785.0, 'completions/mean_length': 5989.890625, 'completions/min_length': 787.0, 'completions/max_length': 15079.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5989.890625, 'completions/min_terminated_length': 787.0, 'completions/max_terminated_length': 15079.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021171819418668747, 'sampling/sampling_logp_difference/max': 2.203883171081543, 'sampling/importance_sampling_ratio/min': 0.11037372797727585, 'sampling/importance_sampling_ratio/mean': 1.0000755786895752, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.665044761826721e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 350/1024 [14:50:04<29:10:43, 155.85s/it][AINFO 12-01 10:21:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:21:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:21:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:21:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▍      | 351/1024 [14:52:39<29:05:54, 155.65s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0036342237144708633, 'learning_rate': 1e-05, 'num_tokens': 288865095.0, 'completions/mean_length': 6067.296875, 'completions/min_length': 376.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5734.5, 'completions/min_terminated_length': 376.0, 'completions/max_terminated_length': 16040.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.32589423656463623, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021084293723106384, 'sampling/sampling_logp_difference/max': 7.809140682220459, 'sampling/importance_sampling_ratio/min': 0.0004060067585669458, 'sampling/importance_sampling_ratio/mean': 0.9999662637710571, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.7183981728740036e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 351/1024 [14:52:39<29:05:54, 155.65s/it][AINFO 12-01 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:23:56 [block_pool.py:292] Successfully reset prefix cache
+
+ 34%|███▍      | 352/1024 [14:55:22<29:27:59, 157.86s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001292324042879045, 'learning_rate': 1e-05, 'num_tokens': 289823850.0, 'completions/mean_length': 7338.7734375, 'completions/min_length': 1014.0, 'completions/max_length': 15398.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7338.7734375, 'completions/min_terminated_length': 1014.0, 'completions/max_terminated_length': 15398.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019760509952902794, 'sampling/sampling_logp_difference/max': 2.953073501586914, 'sampling/importance_sampling_ratio/min': 0.052179086953401566, 'sampling/importance_sampling_ratio/mean': 0.9999390840530396, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.327032133413013e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 352/1024 [14:55:22<29:27:59, 157.86s/it][AINFO 12-01 10:26:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:26:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:26:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:26:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 34%|███▍      | 353/1024 [14:58:10<29:59:03, 160.87s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.004251889884471893, 'learning_rate': 1e-05, 'num_tokens': 290706410.0, 'completions/mean_length': 6743.4375, 'completions/min_length': 621.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6590.4130859375, 'completions/min_terminated_length': 621.0, 'completions/max_terminated_length': 15132.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.41398313641548157, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01994343101978302, 'sampling/sampling_logp_difference/max': 4.931114673614502, 'sampling/importance_sampling_ratio/min': 0.007218452636152506, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.485422500190907e-05, 'epoch': 0.32}
+
+ 34%|███▍      | 353/1024 [14:58:10<29:59:03, 160.87s/it][AINFO 12-01 10:29:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:29:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:29:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:29:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 35%|███▍      | 354/1024 [15:01:15<31:18:35, 168.23s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021808533929288387, 'learning_rate': 1e-05, 'num_tokens': 291781031.0, 'completions/mean_length': 8236.6015625, 'completions/min_length': 508.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7835.9091796875, 'completions/min_terminated_length': 508.0, 'completions/max_terminated_length': 16140.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.4189956784248352, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.020466210320591927, 'sampling/sampling_logp_difference/max': 11.18420124053955, 'sampling/importance_sampling_ratio/min': 1.3891946764488239e-05, 'sampling/importance_sampling_ratio/mean': 1.000045895576477, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.773642912274227e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 354/1024 [15:01:15<31:18:35, 168.23s/it][AINFO 12-01 10:32:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:32:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:32:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:32:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 355/1024 [15:04:18<32:02:01, 172.38s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0015350021421909332, 'learning_rate': 1e-05, 'num_tokens': 292686178.0, 'completions/mean_length': 6895.8984375, 'completions/min_length': 267.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6745.2939453125, 'completions/min_terminated_length': 267.0, 'completions/max_terminated_length': 16092.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.3214361071586609, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021501120179891586, 'sampling/sampling_logp_difference/max': 2.7824230194091797, 'sampling/importance_sampling_ratio/min': 0.06188836693763733, 'sampling/importance_sampling_ratio/mean': 1.0000672340393066, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.883610735210823e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 355/1024 [15:04:18<32:02:01, 172.38s/it][AINFO 12-01 10:35:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:35:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:35:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:35:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▍      | 356/1024 [15:06:48<30:45:31, 165.77s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0021845607552677393, 'learning_rate': 1e-05, 'num_tokens': 293626136.0, 'completions/mean_length': 7186.171875, 'completions/min_length': 1026.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7113.748046875, 'completions/min_terminated_length': 1026.0, 'completions/max_terminated_length': 16173.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2930282652378082, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020528484135866165, 'sampling/sampling_logp_difference/max': 8.734874725341797, 'sampling/importance_sampling_ratio/min': 0.00016087631229311228, 'sampling/importance_sampling_ratio/mean': 1.0000524520874023, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.318942491314374e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 356/1024 [15:06:48<30:45:31, 165.77s/it][AINFO 12-01 10:38:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:38:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:38:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:38:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 35%|███▍      | 357/1024 [15:09:22<30:02:23, 162.13s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.0019846707582473755, 'learning_rate': 1e-05, 'num_tokens': 294405762.0, 'completions/mean_length': 5942.953125, 'completions/min_length': 753.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5777.22265625, 'completions/min_terminated_length': 753.0, 'completions/max_terminated_length': 16130.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021265268325805664, 'sampling/sampling_logp_difference/max': 3.8548974990844727, 'sampling/importance_sampling_ratio/min': 0.021175773814320564, 'sampling/importance_sampling_ratio/mean': 0.9999542832374573, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.268637809445863e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 357/1024 [15:09:22<30:02:23, 162.13s/it][AINFO 12-01 10:40:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:40:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:40:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:40:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 35%|███▍      | 358/1024 [15:12:00<29:47:49, 161.07s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019030128605663776, 'learning_rate': 1e-05, 'num_tokens': 295246567.0, 'completions/mean_length': 6414.1640625, 'completions/min_length': 442.0, 'completions/max_length': 16083.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6414.1640625, 'completions/min_terminated_length': 442.0, 'completions/max_terminated_length': 16083.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.31823596358299255, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018293585628271103, 'sampling/sampling_logp_difference/max': 2.3497424125671387, 'sampling/importance_sampling_ratio/min': 0.13930588960647583, 'sampling/importance_sampling_ratio/mean': 1.0000722408294678, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.296732476585021e-05, 'epoch': 0.33}
+
+ 35%|███▍      | 358/1024 [15:12:00<29:47:49, 161.07s/it][AINFO 12-01 10:43:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:43:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:43:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:43:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 35%|███▌      | 359/1024 [15:14:40<29:42:32, 160.83s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018524479819461703, 'learning_rate': 1e-05, 'num_tokens': 296228363.0, 'completions/mean_length': 7519.03125, 'completions/min_length': 849.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7233.064453125, 'completions/min_terminated_length': 849.0, 'completions/max_terminated_length': 15971.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.4092699885368347, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.021469270810484886, 'sampling/sampling_logp_difference/max': 2.9971799850463867, 'sampling/importance_sampling_ratio/min': 0.04992767050862312, 'sampling/importance_sampling_ratio/mean': 1.000009536743164, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.200569982363959e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 359/1024 [15:14:40<29:42:32, 160.83s/it][AINFO 12-01 10:45:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:45:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:45:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:45:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 35%|███▌      | 360/1024 [15:17:35<30:25:56, 165.00s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.005468362011015415, 'learning_rate': 1e-05, 'num_tokens': 297139692.0, 'completions/mean_length': 6970.1953125, 'completions/min_length': 446.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6587.52001953125, 'completions/min_terminated_length': 446.0, 'completions/max_terminated_length': 16384.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2419992983341217, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020559413358569145, 'sampling/sampling_logp_difference/max': 2.7836108207702637, 'sampling/importance_sampling_ratio/min': 0.06181490048766136, 'sampling/importance_sampling_ratio/mean': 1.000059962272644, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9074841222609393e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 360/1024 [15:17:35<30:25:56, 165.00s/it][AINFO 12-01 10:48:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:48:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:48:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:48:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 35%|███▌      | 361/1024 [15:20:05<29:34:42, 160.61s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019971607252955437, 'learning_rate': 1e-05, 'num_tokens': 297997985.0, 'completions/mean_length': 6547.6640625, 'completions/min_length': 561.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6391.5322265625, 'completions/min_terminated_length': 561.0, 'completions/max_terminated_length': 15614.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0190857145935297, 'sampling/sampling_logp_difference/max': 3.2685952186584473, 'sampling/importance_sampling_ratio/min': 0.03805985301733017, 'sampling/importance_sampling_ratio/mean': 1.0000401735305786, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.905054129267228e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 361/1024 [15:20:05<29:34:42, 160.61s/it][AINFO 12-01 10:51:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:51:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:51:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:51:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 35%|███▌      | 362/1024 [15:22:57<30:09:27, 164.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001602941774763167, 'learning_rate': 1e-05, 'num_tokens': 299103696.0, 'completions/mean_length': 8481.1171875, 'completions/min_length': 731.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 8023.92529296875, 'completions/min_terminated_length': 731.0, 'completions/max_terminated_length': 16381.0, 'rewards/accuracy_reward/mean': 0.21875, 'rewards/accuracy_reward/std': 0.41502299904823303, 'reward': 0.21875, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020526811480522156, 'sampling/sampling_logp_difference/max': 3.933558225631714, 'sampling/importance_sampling_ratio/min': 0.01957390084862709, 'sampling/importance_sampling_ratio/mean': 0.9999719262123108, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.211217858734017e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 362/1024 [15:22:57<30:09:27, 164.00s/it][AINFO 12-01 10:54:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:54:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:54:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:54:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 35%|███▌      | 363/1024 [15:25:47<30:26:19, 165.78s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0029093201737850904, 'learning_rate': 1e-05, 'num_tokens': 300087794.0, 'completions/mean_length': 7525.453125, 'completions/min_length': 834.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7312.84814453125, 'completions/min_terminated_length': 834.0, 'completions/max_terminated_length': 15957.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2227931022644043, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01915670931339264, 'sampling/sampling_logp_difference/max': 1.6407349109649658, 'sampling/importance_sampling_ratio/min': 0.19383752346038818, 'sampling/importance_sampling_ratio/mean': 0.9999940395355225, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.070032287017966e-05, 'epoch': 0.33}
+
+ 35%|███▌      | 363/1024 [15:25:47<30:26:19, 165.78s/it][AINFO 12-01 10:57:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:57:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:57:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 10:57:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 364/1024 [15:28:52<31:25:39, 171.42s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019376726122573018, 'learning_rate': 1e-05, 'num_tokens': 301064902.0, 'completions/mean_length': 7493.46875, 'completions/min_length': 1485.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6979.14013671875, 'completions/min_terminated_length': 1485.0, 'completions/max_terminated_length': 15424.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2811809182167053, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01852152682840824, 'sampling/sampling_logp_difference/max': 5.181365013122559, 'sampling/importance_sampling_ratio/min': 0.005620329640805721, 'sampling/importance_sampling_ratio/mean': 1.0000510215759277, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.317853111184377e-05, 'epoch': 0.33}
+
+ 36%|███▌      | 364/1024 [15:28:52<31:25:39, 171.42s/it][AINFO 12-01 11:00:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:00:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:00:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:00:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 365/1024 [15:31:37<31:03:26, 169.66s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002380593679845333, 'learning_rate': 1e-05, 'num_tokens': 302016501.0, 'completions/mean_length': 7294.1171875, 'completions/min_length': 1133.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7222.54345703125, 'completions/min_terminated_length': 1133.0, 'completions/max_terminated_length': 15908.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019933555275201797, 'sampling/sampling_logp_difference/max': 2.799295425415039, 'sampling/importance_sampling_ratio/min': 0.06085292249917984, 'sampling/importance_sampling_ratio/mean': 1.0000369548797607, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.223492346180137e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 365/1024 [15:31:37<31:03:26, 169.66s/it][AINFO 12-01 11:02:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:02:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:02:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:02:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 366/1024 [15:34:40<31:42:31, 173.48s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.004880616441369057, 'learning_rate': 1e-05, 'num_tokens': 302923225.0, 'completions/mean_length': 6942.09375, 'completions/min_length': 786.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6395.86767578125, 'completions/min_terminated_length': 786.0, 'completions/max_terminated_length': 16328.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2596156895160675, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01911761239171028, 'sampling/sampling_logp_difference/max': 2.1653690338134766, 'sampling/importance_sampling_ratio/min': 0.1147075966000557, 'sampling/importance_sampling_ratio/mean': 0.9999622106552124, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.759958958151401e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 366/1024 [15:34:40<31:42:31, 173.48s/it][AINFO 12-01 11:05:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:05:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:05:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:05:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 367/1024 [15:37:14<30:36:27, 167.71s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0013466214295476675, 'learning_rate': 1e-05, 'num_tokens': 303809465.0, 'completions/mean_length': 6781.625, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6706.015625, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 15946.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01744498312473297, 'sampling/sampling_logp_difference/max': 2.629667282104492, 'sampling/importance_sampling_ratio/min': 0.07210244983434677, 'sampling/importance_sampling_ratio/mean': 1.0000284910202026, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.303603009248036e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 367/1024 [15:37:14<30:36:27, 167.71s/it][AINFO 12-01 11:08:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:08:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:08:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:08:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 368/1024 [15:39:57<30:18:16, 166.31s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0024491064250469208, 'learning_rate': 1e-05, 'num_tokens': 304778129.0, 'completions/mean_length': 7428.3125, 'completions/min_length': 1033.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7286.1591796875, 'completions/min_terminated_length': 1033.0, 'completions/max_terminated_length': 16372.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3090519309043884, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021604089066386223, 'sampling/sampling_logp_difference/max': 3.323939800262451, 'sampling/importance_sampling_ratio/min': 0.036010678857564926, 'sampling/importance_sampling_ratio/mean': 0.9999817609786987, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.93168658472132e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 368/1024 [15:39:57<30:18:16, 166.31s/it][AINFO 12-01 11:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:11:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:11:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 369/1024 [15:42:31<29:35:26, 162.64s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.003240464720875025, 'learning_rate': 1e-05, 'num_tokens': 305545408.0, 'completions/mean_length': 5864.3671875, 'completions/min_length': 728.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5525.02392578125, 'completions/min_terminated_length': 728.0, 'completions/max_terminated_length': 15043.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01894432306289673, 'sampling/sampling_logp_difference/max': 2.0896401405334473, 'sampling/importance_sampling_ratio/min': 0.12373165041208267, 'sampling/importance_sampling_ratio/mean': 0.9999258518218994, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.0599821836149204e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 369/1024 [15:42:31<29:35:26, 162.64s/it][AINFO 12-01 11:13:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:13:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:13:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:13:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 370/1024 [15:45:19<29:49:44, 164.20s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0019397672731429338, 'learning_rate': 1e-05, 'num_tokens': 306621134.0, 'completions/mean_length': 8232.296875, 'completions/min_length': 235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7969.33837890625, 'completions/min_terminated_length': 235.0, 'completions/max_terminated_length': 15891.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2988022267818451, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02167891152203083, 'sampling/sampling_logp_difference/max': 8.486870765686035, 'sampling/importance_sampling_ratio/min': 0.00020615736139006913, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 9.145735066340421e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 370/1024 [15:45:19<29:49:44, 164.20s/it][AINFO 12-01 11:16:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:16:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:16:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:16:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▌      | 371/1024 [15:48:17<30:32:12, 168.35s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0031762919388711452, 'learning_rate': 1e-05, 'num_tokens': 307606196.0, 'completions/mean_length': 7559.421875, 'completions/min_length': 1511.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7347.63232421875, 'completions/min_terminated_length': 1511.0, 'completions/max_terminated_length': 15695.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018857207149267197, 'sampling/sampling_logp_difference/max': 3.7277402877807617, 'sampling/importance_sampling_ratio/min': 0.024047115817666054, 'sampling/importance_sampling_ratio/mean': 0.9999840259552002, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.436190849650302e-05, 'epoch': 0.34}
+
+ 36%|███▌      | 371/1024 [15:48:17<30:32:12, 168.35s/it][AINFO 12-01 11:19:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:19:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:19:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:19:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▋      | 372/1024 [15:51:01<30:13:42, 166.91s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014498669188469648, 'learning_rate': 1e-05, 'num_tokens': 308580903.0, 'completions/mean_length': 7457.0234375, 'completions/min_length': 1283.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7169.05615234375, 'completions/min_terminated_length': 1283.0, 'completions/max_terminated_length': 16163.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02182072401046753, 'sampling/sampling_logp_difference/max': 2.712770938873291, 'sampling/importance_sampling_ratio/min': 0.06635269522666931, 'sampling/importance_sampling_ratio/mean': 0.9999907612800598, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.608365545915149e-05, 'epoch': 0.34}
+
+ 36%|███▋      | 372/1024 [15:51:01<30:13:42, 166.91s/it][AINFO 12-01 11:22:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:22:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:22:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:22:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 36%|███▋      | 373/1024 [15:53:21<28:44:49, 158.97s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0015558591112494469, 'learning_rate': 1e-05, 'num_tokens': 309478450.0, 'completions/mean_length': 6852.5859375, 'completions/min_length': 1680.0, 'completions/max_length': 15670.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6852.5859375, 'completions/min_terminated_length': 1680.0, 'completions/max_terminated_length': 15670.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021383441984653473, 'sampling/sampling_logp_difference/max': 2.9336295127868652, 'sampling/importance_sampling_ratio/min': 0.053203582763671875, 'sampling/importance_sampling_ratio/mean': 0.9999746680259705, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.586268348954036e-05, 'epoch': 0.34}
+
+ 36%|███▋      | 373/1024 [15:53:21<28:44:49, 158.97s/it][AINFO 12-01 11:24:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:24:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:24:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:24:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 374/1024 [15:56:08<29:08:26, 161.40s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001840326003730297, 'learning_rate': 1e-05, 'num_tokens': 310434624.0, 'completions/mean_length': 7332.421875, 'completions/min_length': 1025.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7188.74658203125, 'completions/min_terminated_length': 1025.0, 'completions/max_terminated_length': 16170.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3043339252471924, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01931198127567768, 'sampling/sampling_logp_difference/max': 3.6437888145446777, 'sampling/importance_sampling_ratio/min': 0.026153067126870155, 'sampling/importance_sampling_ratio/mean': 0.9999426603317261, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.059259729023324e-05, 'epoch': 0.34}
+
+ 37%|███▋      | 374/1024 [15:56:08<29:08:26, 161.40s/it][AINFO 12-01 11:27:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:27:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:27:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:27:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 375/1024 [15:58:43<28:43:52, 159.37s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001488545211032033, 'learning_rate': 1e-05, 'num_tokens': 311351061.0, 'completions/mean_length': 7012.6015625, 'completions/min_length': 1561.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6938.81103515625, 'completions/min_terminated_length': 1561.0, 'completions/max_terminated_length': 15596.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.21959787607192993, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0206888560205698, 'sampling/sampling_logp_difference/max': 10.93203067779541, 'sampling/importance_sampling_ratio/min': 1.7876371202873997e-05, 'sampling/importance_sampling_ratio/mean': 1.0000215768814087, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0930035904930264e-05, 'epoch': 0.34}
+
+ 37%|███▋      | 375/1024 [15:58:43<28:43:52, 159.37s/it][AINFO 12-01 11:29:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:29:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:29:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:29:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 376/1024 [16:01:11<28:06:30, 156.16s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0031663822010159492, 'learning_rate': 1e-05, 'num_tokens': 312120571.0, 'completions/mean_length': 5867.234375, 'completions/min_length': 987.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5784.42529296875, 'completions/min_terminated_length': 987.0, 'completions/max_terminated_length': 15527.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3703257441520691, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02000812068581581, 'sampling/sampling_logp_difference/max': 1.6203699111938477, 'sampling/importance_sampling_ratio/min': 0.19782550632953644, 'sampling/importance_sampling_ratio/mean': 0.9999905824661255, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.62690695207857e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 376/1024 [16:01:11<28:06:30, 156.16s/it][AINFO 12-01 11:32:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:32:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:32:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:32:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 377/1024 [16:03:55<28:26:39, 158.27s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002196704037487507, 'learning_rate': 1e-05, 'num_tokens': 312979051.0, 'completions/mean_length': 6541.1875, 'completions/min_length': 474.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6384.95263671875, 'completions/min_terminated_length': 474.0, 'completions/max_terminated_length': 16286.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02040308713912964, 'sampling/sampling_logp_difference/max': 1.880664348602295, 'sampling/importance_sampling_ratio/min': 0.15248876810073853, 'sampling/importance_sampling_ratio/mean': 0.999919593334198, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.29714252732083e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 377/1024 [16:03:55<28:26:39, 158.27s/it][AINFO 12-01 11:35:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:35:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:35:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:35:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 378/1024 [16:06:47<29:08:03, 162.36s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016656353836879134, 'learning_rate': 1e-05, 'num_tokens': 314032065.0, 'completions/mean_length': 8064.046875, 'completions/min_length': 1101.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7864.3681640625, 'completions/min_terminated_length': 1101.0, 'completions/max_terminated_length': 16144.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.19780512154102325, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02024432271718979, 'sampling/sampling_logp_difference/max': 2.259089469909668, 'sampling/importance_sampling_ratio/min': 0.10444553941488266, 'sampling/importance_sampling_ratio/mean': 0.9999909996986389, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.2824518453035125e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 378/1024 [16:06:47<29:08:03, 162.36s/it][AINFO 12-01 11:38:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:38:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:38:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:38:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 379/1024 [16:09:39<29:37:28, 165.35s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001331070438027382, 'learning_rate': 1e-05, 'num_tokens': 315040933.0, 'completions/mean_length': 7727.28125, 'completions/min_length': 854.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7448.0322265625, 'completions/min_terminated_length': 854.0, 'completions/max_terminated_length': 16016.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.15650182962417603, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02078399248421192, 'sampling/sampling_logp_difference/max': 3.1385183334350586, 'sampling/importance_sampling_ratio/min': 0.04334697499871254, 'sampling/importance_sampling_ratio/mean': 1.000110387802124, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.1671107958809444e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 379/1024 [16:09:39<29:37:28, 165.35s/it][AINFO 12-01 11:40:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:40:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:40:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:40:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 37%|███▋      | 380/1024 [16:12:29<29:49:37, 166.74s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001208718866109848, 'learning_rate': 1e-05, 'num_tokens': 316044065.0, 'completions/mean_length': 7652.34375, 'completions/min_length': 525.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7370.67724609375, 'completions/min_terminated_length': 525.0, 'completions/max_terminated_length': 15666.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3122295141220093, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019106945022940636, 'sampling/sampling_logp_difference/max': 1.9319443702697754, 'sampling/importance_sampling_ratio/min': 0.14486625790596008, 'sampling/importance_sampling_ratio/mean': 0.99993896484375, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.303245006871293e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 380/1024 [16:12:29<29:49:37, 166.74s/it][AINFO 12-01 11:43:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:43:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:43:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:43:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 381/1024 [16:15:21<30:03:57, 168.33s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018009255873039365, 'learning_rate': 1e-05, 'num_tokens': 317066400.0, 'completions/mean_length': 7846.6171875, 'completions/min_length': 900.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7641.72021484375, 'completions/min_terminated_length': 900.0, 'completions/max_terminated_length': 14979.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.18884867429733276, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020989827811717987, 'sampling/sampling_logp_difference/max': 3.798849582672119, 'sampling/importance_sampling_ratio/min': 0.022396523505449295, 'sampling/importance_sampling_ratio/mean': 1.0000481605529785, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5691803279623855e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 381/1024 [16:15:21<30:03:57, 168.33s/it][AINFO 12-01 11:46:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:46:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:46:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:46:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 382/1024 [16:18:00<29:32:54, 165.69s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009951089741662145, 'learning_rate': 1e-05, 'num_tokens': 318101862.0, 'completions/mean_length': 7920.796875, 'completions/min_length': 1042.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7647.7900390625, 'completions/min_terminated_length': 1042.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.20463499426841736, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021802615374326706, 'sampling/sampling_logp_difference/max': 2.757632255554199, 'sampling/importance_sampling_ratio/min': 0.06344180554151535, 'sampling/importance_sampling_ratio/mean': 1.0000214576721191, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4224756348066876e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 382/1024 [16:18:00<29:32:54, 165.69s/it][AINFO 12-01 11:49:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:49:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:49:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:49:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 37%|███▋      | 383/1024 [16:20:53<29:51:28, 167.69s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.004091652110219002, 'learning_rate': 1e-05, 'num_tokens': 319042289.0, 'completions/mean_length': 7155.2109375, 'completions/min_length': 968.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7082.54345703125, 'completions/min_terminated_length': 968.0, 'completions/max_terminated_length': 15603.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.34245961904525757, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020797304809093475, 'sampling/sampling_logp_difference/max': 2.2164440155029297, 'sampling/importance_sampling_ratio/min': 0.13563524186611176, 'sampling/importance_sampling_ratio/mean': 0.9999948143959045, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2535546046783566e-05, 'epoch': 0.35}
+
+ 37%|███▋      | 383/1024 [16:20:53<29:51:28, 167.69s/it][AINFO 12-01 11:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:52:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 384/1024 [16:24:06<31:10:58, 175.40s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002398962853476405, 'learning_rate': 1e-05, 'num_tokens': 320012148.0, 'completions/mean_length': 7419.3359375, 'completions/min_length': 776.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6978.4501953125, 'completions/min_terminated_length': 776.0, 'completions/max_terminated_length': 16181.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2585548758506775, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020775236189365387, 'sampling/sampling_logp_difference/max': 9.699130058288574, 'sampling/importance_sampling_ratio/min': 6.133683200459927e-05, 'sampling/importance_sampling_ratio/mean': 0.9999755620956421, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.486820694182825e-05, 'epoch': 0.35}
+
+ 38%|███▊      | 384/1024 [16:24:06<31:10:58, 175.40s/it][AINFO 12-01 11:55:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:55:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:55:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:55:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 385/1024 [16:27:14<31:48:08, 179.17s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0020817951299250126, 'learning_rate': 1e-05, 'num_tokens': 321059838.0, 'completions/mean_length': 8010.328125, 'completions/min_length': 597.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7452.08349609375, 'completions/min_terminated_length': 597.0, 'completions/max_terminated_length': 15906.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3322049677371979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02137676067650318, 'sampling/sampling_logp_difference/max': 5.814737319946289, 'sampling/importance_sampling_ratio/min': 0.0029832636937499046, 'sampling/importance_sampling_ratio/mean': 0.9999976754188538, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.329194497491699e-05, 'epoch': 0.35}
+
+ 38%|███▊      | 385/1024 [16:27:14<31:48:08, 179.17s/it][AINFO 12-01 11:58:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:58:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:58:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 11:58:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 386/1024 [16:29:25<29:10:23, 164.61s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022205435670912266, 'learning_rate': 1e-05, 'num_tokens': 321840021.0, 'completions/mean_length': 5945.0546875, 'completions/min_length': 1206.0, 'completions/max_length': 15518.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5945.0546875, 'completions/min_terminated_length': 1206.0, 'completions/max_terminated_length': 15518.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.228030264377594, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017921704798936844, 'sampling/sampling_logp_difference/max': 5.36189603805542, 'sampling/importance_sampling_ratio/min': 0.004692001733928919, 'sampling/importance_sampling_ratio/mean': 1.0000481605529785, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.981734587592655e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 386/1024 [16:29:25<29:10:23, 164.61s/it][AINFO 12-01 12:00:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:00:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:00:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:00:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 387/1024 [16:32:06<28:56:02, 163.52s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007956126355566084, 'learning_rate': 1e-05, 'num_tokens': 322861000.0, 'completions/mean_length': 7821.3359375, 'completions/min_length': 978.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7753.91357421875, 'completions/min_terminated_length': 978.0, 'completions/max_terminated_length': 15668.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2685721516609192, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021585073322057724, 'sampling/sampling_logp_difference/max': 2.5420260429382324, 'sampling/importance_sampling_ratio/min': 0.0787067711353302, 'sampling/importance_sampling_ratio/mean': 0.9999861121177673, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.438731866684975e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 387/1024 [16:32:06<28:56:02, 163.52s/it][AINFO 12-01 12:03:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:03:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:03:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:03:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 388/1024 [16:35:06<29:46:58, 168.58s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020157224498689175, 'learning_rate': 1e-05, 'num_tokens': 323903490.0, 'completions/mean_length': 7991.203125, 'completions/min_length': 926.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7356.4541015625, 'completions/min_terminated_length': 926.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3287900388240814, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01778440549969673, 'sampling/sampling_logp_difference/max': 2.4291903972625732, 'sampling/importance_sampling_ratio/min': 0.0881081372499466, 'sampling/importance_sampling_ratio/mean': 1.0000150203704834, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.452533943956951e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 388/1024 [16:35:06<29:46:58, 168.58s/it][AINFO 12-01 12:06:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:06:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:06:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:06:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 389/1024 [16:37:56<29:47:57, 168.94s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001724523608572781, 'learning_rate': 1e-05, 'num_tokens': 324928331.0, 'completions/mean_length': 7865.0703125, 'completions/min_length': 475.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7590.26611328125, 'completions/min_terminated_length': 475.0, 'completions/max_terminated_length': 16165.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.26143792271614075, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02049725130200386, 'sampling/sampling_logp_difference/max': 4.513208389282227, 'sampling/importance_sampling_ratio/min': 0.010963229462504387, 'sampling/importance_sampling_ratio/mean': 1.0000388622283936, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.584039958719586e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 389/1024 [16:37:56<29:47:57, 168.94s/it][AINFO 12-01 12:09:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:09:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:09:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:09:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 390/1024 [16:40:42<29:35:28, 168.03s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.000601888750679791, 'learning_rate': 1e-05, 'num_tokens': 325795165.0, 'completions/mean_length': 6631.328125, 'completions/min_length': 857.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6476.5244140625, 'completions/min_terminated_length': 857.0, 'completions/max_terminated_length': 16274.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.15650184452533722, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02086561731994152, 'sampling/sampling_logp_difference/max': 1.4085359573364258, 'sampling/importance_sampling_ratio/min': 0.244500994682312, 'sampling/importance_sampling_ratio/mean': 1.0001271963119507, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.333021709295281e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 390/1024 [16:40:42<29:35:28, 168.03s/it][AINFO 12-01 12:11:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:11:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:11:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:11:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 391/1024 [16:43:21<29:04:50, 165.39s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0021767502184957266, 'learning_rate': 1e-05, 'num_tokens': 326819353.0, 'completions/mean_length': 7832.65625, 'completions/min_length': 924.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7627.42431640625, 'completions/min_terminated_length': 924.0, 'completions/max_terminated_length': 16302.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02070239745080471, 'sampling/sampling_logp_difference/max': 3.1052417755126953, 'sampling/importance_sampling_ratio/min': 0.04481368511915207, 'sampling/importance_sampling_ratio/mean': 0.9999992847442627, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.491334780392208e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 391/1024 [16:43:21<29:04:50, 165.39s/it][AINFO 12-01 12:14:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:14:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:14:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:14:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 38%|███▊      | 392/1024 [16:46:02<28:47:55, 164.04s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0028393822722136974, 'learning_rate': 1e-05, 'num_tokens': 327774851.0, 'completions/mean_length': 7333.140625, 'completions/min_length': 838.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7189.4765625, 'completions/min_terminated_length': 838.0, 'completions/max_terminated_length': 15675.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3253750801086426, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02081885002553463, 'sampling/sampling_logp_difference/max': 2.902829170227051, 'sampling/importance_sampling_ratio/min': 0.05486776679754257, 'sampling/importance_sampling_ratio/mean': 0.9999268054962158, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.412582747041597e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 392/1024 [16:46:02<28:47:55, 164.04s/it][AINFO 12-01 12:17:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:17:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:17:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:17:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 393/1024 [16:48:53<29:08:00, 166.21s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0026299161836504936, 'learning_rate': 1e-05, 'num_tokens': 328697629.0, 'completions/mean_length': 7045.828125, 'completions/min_length': 1551.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6744.5966796875, 'completions/min_terminated_length': 1551.0, 'completions/max_terminated_length': 16112.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.2580229640007019, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020240478217601776, 'sampling/sampling_logp_difference/max': 6.9532151222229, 'sampling/importance_sampling_ratio/min': 0.0009555579745210707, 'sampling/importance_sampling_ratio/mean': 1.0000052452087402, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4505347343838366e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 393/1024 [16:48:53<29:08:00, 166.21s/it][AINFO 12-01 12:20:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:20:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:20:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:20:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 38%|███▊      | 394/1024 [16:51:40<29:06:00, 166.29s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002642236417159438, 'learning_rate': 1e-05, 'num_tokens': 329630831.0, 'completions/mean_length': 7131.765625, 'completions/min_length': 804.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6984.9052734375, 'completions/min_terminated_length': 804.0, 'completions/max_terminated_length': 15257.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.26143792271614075, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0192926824092865, 'sampling/sampling_logp_difference/max': 2.7944908142089844, 'sampling/importance_sampling_ratio/min': 0.09045516699552536, 'sampling/importance_sampling_ratio/mean': 0.9999628067016602, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9053459684291738e-05, 'epoch': 0.36}
+
+ 38%|███▊      | 394/1024 [16:51:40<29:06:00, 166.29s/it][AINFO 12-01 12:22:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:22:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:22:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:22:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 39%|███▊      | 395/1024 [16:54:20<28:45:16, 164.57s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.002113129710778594, 'learning_rate': 1e-05, 'num_tokens': 330511904.0, 'completions/mean_length': 6734.8828125, 'completions/min_length': 279.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6423.62060546875, 'completions/min_terminated_length': 279.0, 'completions/max_terminated_length': 16214.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019051650539040565, 'sampling/sampling_logp_difference/max': 2.1480674743652344, 'sampling/importance_sampling_ratio/min': 0.11670948565006256, 'sampling/importance_sampling_ratio/mean': 1.000004768371582, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.394124109945551e-05, 'epoch': 0.36}
+
+ 39%|███▊      | 395/1024 [16:54:20<28:45:16, 164.57s/it][AINFO 12-01 12:25:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:25:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:25:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:25:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 39%|███▊      | 396/1024 [16:57:20<29:28:37, 168.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021214222069829702, 'learning_rate': 1e-05, 'num_tokens': 331532126.0, 'completions/mean_length': 7821.234375, 'completions/min_length': 1882.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7473.154296875, 'completions/min_terminated_length': 1882.0, 'completions/max_terminated_length': 15598.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.29432153701782227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020253345370292664, 'sampling/sampling_logp_difference/max': 2.4118354320526123, 'sampling/importance_sampling_ratio/min': 0.08965059369802475, 'sampling/importance_sampling_ratio/mean': 0.9999544620513916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.83000740132411e-05, 'epoch': 0.36}
+
+ 39%|███▊      | 396/1024 [16:57:20<29:28:37, 168.98s/it][AINFO 12-01 12:28:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:28:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:28:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:28:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 39%|███▉      | 397/1024 [17:00:06<29:17:34, 168.19s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001391960890032351, 'learning_rate': 1e-05, 'num_tokens': 332402172.0, 'completions/mean_length': 6640.109375, 'completions/min_length': 948.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6563.3857421875, 'completions/min_terminated_length': 948.0, 'completions/max_terminated_length': 15192.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.2845909595489502, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019380351528525352, 'sampling/sampling_logp_difference/max': 2.6286802291870117, 'sampling/importance_sampling_ratio/min': 0.07217364758253098, 'sampling/importance_sampling_ratio/mean': 1.0000360012054443, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.00869573190721e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 397/1024 [17:00:06<29:17:34, 168.19s/it][AINFO 12-01 12:31:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:31:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:31:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:31:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 39%|███▉      | 398/1024 [17:02:52<29:07:42, 167.51s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002047958318144083, 'learning_rate': 1e-05, 'num_tokens': 333328268.0, 'completions/mean_length': 7086.0625, 'completions/min_length': 430.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6938.4765625, 'completions/min_terminated_length': 430.0, 'completions/max_terminated_length': 16181.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.17176413536071777, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020471179857850075, 'sampling/sampling_logp_difference/max': 1.4765467643737793, 'sampling/importance_sampling_ratio/min': 0.22842514514923096, 'sampling/importance_sampling_ratio/mean': 0.9999731779098511, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.4781629943172447e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 398/1024 [17:02:52<29:07:42, 167.51s/it][AINFO 12-01 12:34:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:34:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:34:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:34:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+[OpenTinker] 2025-12-01 12:36:08,656 - math_verify.grader - WARNING - Timeout during comparison
+[OpenTinker] 2025-12-01 12:36:13,781 - math_verify.grader - WARNING - Timeout during comparison
+
+ 39%|███▉      | 399/1024 [17:05:58<30:04:31, 173.23s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009592739515937865, 'learning_rate': 1e-05, 'num_tokens': 334371848.0, 'completions/mean_length': 7996.03125, 'completions/min_length': 1036.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7655.056640625, 'completions/min_terminated_length': 1036.0, 'completions/max_terminated_length': 16294.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.22673209011554718, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018109828233718872, 'sampling/sampling_logp_difference/max': 4.161067962646484, 'sampling/importance_sampling_ratio/min': 0.01559089869260788, 'sampling/importance_sampling_ratio/mean': 1.0000035762786865, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.019221660200856e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 399/1024 [17:05:58<30:04:31, 173.23s/it][AINFO 12-01 12:37:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:37:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:37:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:37:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 39%|███▉      | 400/1024 [17:08:50<29:58:06, 172.89s/it][A
+                                                         [A{'loss': -0.0002, 'grad_norm': 0.0013220797991380095, 'learning_rate': 1e-05, 'num_tokens': 335386157.0, 'completions/mean_length': 7773.9140625, 'completions/min_length': 26.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7496.1689453125, 'completions/min_terminated_length': 26.0, 'completions/max_terminated_length': 16384.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021365061402320862, 'sampling/sampling_logp_difference/max': 3.3891077041625977, 'sampling/importance_sampling_ratio/min': 0.033738769590854645, 'sampling/importance_sampling_ratio/mean': 0.9999306201934814, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.587874198274221e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 400/1024 [17:08:50<29:58:06, 172.89s/it][AINFO 12-01 12:40:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:40:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:40:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:40:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 401/1024 [17:11:29<29:09:47, 168.52s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001826947322115302, 'learning_rate': 1e-05, 'num_tokens': 336314840.0, 'completions/mean_length': 7114.0859375, 'completions/min_length': 1033.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6891.6083984375, 'completions/min_terminated_length': 1033.0, 'completions/max_terminated_length': 15994.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0198574960231781, 'sampling/sampling_logp_difference/max': 4.040801048278809, 'sampling/importance_sampling_ratio/min': 0.01758338138461113, 'sampling/importance_sampling_ratio/mean': 1.0000356435775757, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.437237873844424e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 401/1024 [17:11:29<29:09:47, 168.52s/it][AINFO 12-01 12:42:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:42:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:42:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:42:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 39%|███▉      | 402/1024 [17:14:24<29:28:51, 170.63s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001366328913718462, 'learning_rate': 1e-05, 'num_tokens': 337318300.0, 'completions/mean_length': 7668.65625, 'completions/min_length': 772.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7164.46240234375, 'completions/min_terminated_length': 772.0, 'completions/max_terminated_length': 14976.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.1990984082221985, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018450815230607986, 'sampling/sampling_logp_difference/max': 2.16837215423584, 'sampling/importance_sampling_ratio/min': 0.11436362564563751, 'sampling/importance_sampling_ratio/mean': 1.0000301599502563, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.297336818126496e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 402/1024 [17:14:24<29:28:51, 170.63s/it][AINFO 12-01 12:45:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:45:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:45:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:45:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 39%|███▉      | 403/1024 [17:17:13<29:20:32, 170.10s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018877164693549275, 'learning_rate': 1e-05, 'num_tokens': 338182151.0, 'completions/mean_length': 6612.7734375, 'completions/min_length': 968.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6378.26416015625, 'completions/min_terminated_length': 968.0, 'completions/max_terminated_length': 14074.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.19097033143043518, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020930606871843338, 'sampling/sampling_logp_difference/max': 3.2264204025268555, 'sampling/importance_sampling_ratio/min': 0.03969934955239296, 'sampling/importance_sampling_ratio/mean': 1.0000003576278687, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.966044303320814e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 403/1024 [17:17:13<29:20:32, 170.10s/it][AINFO 12-01 12:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:48:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:48:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 39%|███▉      | 404/1024 [17:20:01<29:11:16, 169.48s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010692864889279008, 'learning_rate': 1e-05, 'num_tokens': 339189744.0, 'completions/mean_length': 7737.4453125, 'completions/min_length': 1120.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7458.52392578125, 'completions/min_terminated_length': 1120.0, 'completions/max_terminated_length': 14683.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.15650182962417603, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021117107942700386, 'sampling/sampling_logp_difference/max': 2.1971189975738525, 'sampling/importance_sampling_ratio/min': 0.11112283915281296, 'sampling/importance_sampling_ratio/mean': 1.0000300407409668, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.572063610590703e-05, 'epoch': 0.37}
+
+ 39%|███▉      | 404/1024 [17:20:01<29:11:16, 169.48s/it][AINFO 12-01 12:51:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:51:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:51:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:51:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|███▉      | 405/1024 [17:22:41<28:37:13, 166.45s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0034691074397414923, 'learning_rate': 1e-05, 'num_tokens': 340071378.0, 'completions/mean_length': 6734.015625, 'completions/min_length': 870.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6658.03125, 'completions/min_terminated_length': 870.0, 'completions/max_terminated_length': 15563.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02153743803501129, 'sampling/sampling_logp_difference/max': 2.6687488555908203, 'sampling/importance_sampling_ratio/min': 0.06933892518281937, 'sampling/importance_sampling_ratio/mean': 0.9999605417251587, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.9296873044113454e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 405/1024 [17:22:41<28:37:13, 166.45s/it][AINFO 12-01 12:53:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:53:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:53:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:53:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|███▉      | 406/1024 [17:25:34<28:55:36, 168.51s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0011876666685566306, 'learning_rate': 1e-05, 'num_tokens': 341094044.0, 'completions/mean_length': 7808.578125, 'completions/min_length': 502.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7531.951171875, 'completions/min_terminated_length': 502.0, 'completions/max_terminated_length': 16158.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.12415502220392227, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02229788899421692, 'sampling/sampling_logp_difference/max': 8.140464782714844, 'sampling/importance_sampling_ratio/min': 0.0002915016666520387, 'sampling/importance_sampling_ratio/mean': 1.0000410079956055, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.0864376381268812e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 406/1024 [17:25:34<28:55:36, 168.51s/it][AINFO 12-01 12:56:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:56:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:56:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:56:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|███▉      | 407/1024 [17:28:01<27:45:39, 161.98s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019851415418088436, 'learning_rate': 1e-05, 'num_tokens': 341904855.0, 'completions/mean_length': 6180.4609375, 'completions/min_length': 1377.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6100.1181640625, 'completions/min_terminated_length': 1377.0, 'completions/max_terminated_length': 13610.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020720943808555603, 'sampling/sampling_logp_difference/max': 2.493959665298462, 'sampling/importance_sampling_ratio/min': 0.08258231729269028, 'sampling/importance_sampling_ratio/mean': 1.0000265836715698, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.11184025779221e-05, 'epoch': 0.37}
+
+ 40%|███▉      | 407/1024 [17:28:01<27:45:39, 161.98s/it][AINFO 12-01 12:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 12:59:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|███▉      | 408/1024 [17:30:28<26:56:43, 157.47s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001253474154509604, 'learning_rate': 1e-05, 'num_tokens': 342792105.0, 'completions/mean_length': 6765.828125, 'completions/min_length': 975.0, 'completions/max_length': 15649.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6765.828125, 'completions/min_terminated_length': 975.0, 'completions/max_terminated_length': 15649.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02162204682826996, 'sampling/sampling_logp_difference/max': 3.480578899383545, 'sampling/importance_sampling_ratio/min': 0.030789582058787346, 'sampling/importance_sampling_ratio/mean': 1.0000550746917725, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1342131453347974e-05, 'epoch': 0.38}
+
+ 40%|███▉      | 408/1024 [17:30:28<26:56:43, 157.47s/it][AINFO 12-01 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:01:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|███▉      | 409/1024 [17:33:17<27:31:53, 161.16s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002397125121206045, 'learning_rate': 1e-05, 'num_tokens': 343816365.0, 'completions/mean_length': 7849.71875, 'completions/min_length': 1274.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7502.79638671875, 'completions/min_terminated_length': 1274.0, 'completions/max_terminated_length': 15895.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.248829185962677, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020188214257359505, 'sampling/sampling_logp_difference/max': 4.831815719604492, 'sampling/importance_sampling_ratio/min': 0.007972033694386482, 'sampling/importance_sampling_ratio/mean': 1.0000368356704712, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.895732800174301e-05, 'epoch': 0.38}
+
+ 40%|███▉      | 409/1024 [17:33:17<27:31:53, 161.16s/it][AINFO 12-01 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:04:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|████      | 410/1024 [17:35:59<27:31:47, 161.41s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001465218374505639, 'learning_rate': 1e-05, 'num_tokens': 344765822.0, 'completions/mean_length': 7257.4453125, 'completions/min_length': 403.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6963.0400390625, 'completions/min_terminated_length': 403.0, 'completions/max_terminated_length': 16282.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021321166306734085, 'sampling/sampling_logp_difference/max': 3.449896812438965, 'sampling/importance_sampling_ratio/min': 0.03174890950322151, 'sampling/importance_sampling_ratio/mean': 1.0000324249267578, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.900004355728015e-05, 'epoch': 0.38}
+
+ 40%|████      | 410/1024 [17:35:59<27:31:47, 161.41s/it][AINFO 12-01 13:07:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:07:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:07:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:07:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|████      | 411/1024 [17:38:51<27:59:12, 164.36s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.001725564943626523, 'learning_rate': 1e-05, 'num_tokens': 345632224.0, 'completions/mean_length': 6617.703125, 'completions/min_length': 965.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6462.68310546875, 'completions/min_terminated_length': 965.0, 'completions/max_terminated_length': 15633.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01956915482878685, 'sampling/sampling_logp_difference/max': 1.6923768520355225, 'sampling/importance_sampling_ratio/min': 0.18408146500587463, 'sampling/importance_sampling_ratio/mean': 1.0000470876693726, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.2157296573132044e-05, 'epoch': 0.38}
+
+ 40%|████      | 411/1024 [17:38:51<27:59:12, 164.36s/it][AINFO 12-01 13:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:10:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:10:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|████      | 412/1024 [17:41:41<28:16:05, 166.28s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017026748973876238, 'learning_rate': 1e-05, 'num_tokens': 346674212.0, 'completions/mean_length': 7986.53125, 'completions/min_length': 1917.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7715.64501953125, 'completions/min_terminated_length': 1917.0, 'completions/max_terminated_length': 16050.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3088145852088928, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01981796883046627, 'sampling/sampling_logp_difference/max': 8.869597434997559, 'sampling/importance_sampling_ratio/min': 0.00014059917884878814, 'sampling/importance_sampling_ratio/mean': 1.0000841617584229, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.92514363436203e-05, 'epoch': 0.38}
+
+ 40%|████      | 412/1024 [17:41:41<28:16:05, 166.28s/it][AINFO 12-01 13:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:12:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:12:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|████      | 413/1024 [17:44:16<27:37:59, 162.81s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001737804152071476, 'learning_rate': 1e-05, 'num_tokens': 347623771.0, 'completions/mean_length': 7281.4296875, 'completions/min_length': 1367.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7062.96826171875, 'completions/min_terminated_length': 1367.0, 'completions/max_terminated_length': 14129.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2995538115501404, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019691023975610733, 'sampling/sampling_logp_difference/max': 2.74348783493042, 'sampling/importance_sampling_ratio/min': 0.06538444757461548, 'sampling/importance_sampling_ratio/mean': 1.0000442266464233, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.8244820593245095e-05, 'epoch': 0.38}
+
+ 40%|████      | 413/1024 [17:44:16<27:37:59, 162.81s/it][AINFO 12-01 13:15:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:15:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:15:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:15:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 40%|████      | 414/1024 [17:47:04<27:50:14, 164.29s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.00197951914742589, 'learning_rate': 1e-05, 'num_tokens': 348562630.0, 'completions/mean_length': 7160.2109375, 'completions/min_length': 1280.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7013.8017578125, 'completions/min_terminated_length': 1280.0, 'completions/max_terminated_length': 15551.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.24829238653182983, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021335896104574203, 'sampling/sampling_logp_difference/max': 2.3712759017944336, 'sampling/importance_sampling_ratio/min': 0.09336152672767639, 'sampling/importance_sampling_ratio/mean': 1.0000531673431396, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.542723902683065e-05, 'epoch': 0.38}
+
+ 40%|████      | 414/1024 [17:47:04<27:50:14, 164.29s/it][AINFO 12-01 13:18:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:18:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:18:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:18:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 41%|████      | 415/1024 [17:49:43<27:33:22, 162.89s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012181147467345, 'learning_rate': 1e-05, 'num_tokens': 349379458.0, 'completions/mean_length': 6221.78125, 'completions/min_length': 661.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6060.4765625, 'completions/min_terminated_length': 661.0, 'completions/max_terminated_length': 15776.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019453339278697968, 'sampling/sampling_logp_difference/max': 3.056304693222046, 'sampling/importance_sampling_ratio/min': 0.04706127941608429, 'sampling/importance_sampling_ratio/mean': 0.999975323677063, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.222474780770426e-05, 'epoch': 0.38}
+
+ 41%|████      | 415/1024 [17:49:43<27:33:22, 162.89s/it][AINFO 12-01 13:21:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:21:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:21:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:21:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 41%|████      | 416/1024 [17:52:34<27:55:15, 165.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0023128341417759657, 'learning_rate': 1e-05, 'num_tokens': 350430977.0, 'completions/mean_length': 8053.2421875, 'completions/min_length': 907.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7853.30419921875, 'completions/min_terminated_length': 907.0, 'completions/max_terminated_length': 16213.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020572412759065628, 'sampling/sampling_logp_difference/max': 8.203275680541992, 'sampling/importance_sampling_ratio/min': 0.00027375537320040166, 'sampling/importance_sampling_ratio/mean': 1.000013828277588, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5939330334476836e-05, 'epoch': 0.38}
+
+ 41%|████      | 416/1024 [17:52:34<27:55:15, 165.32s/it][AINFO 12-01 13:23:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:23:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:23:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:23:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 417/1024 [17:55:21<27:55:47, 165.65s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.0026674706023186445, 'learning_rate': 1e-05, 'num_tokens': 351377935.0, 'completions/mean_length': 7233.234375, 'completions/min_length': 830.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6541.16015625, 'completions/min_terminated_length': 830.0, 'completions/max_terminated_length': 15872.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020142339169979095, 'sampling/sampling_logp_difference/max': 2.0965206623077393, 'sampling/importance_sampling_ratio/min': 0.12288323789834976, 'sampling/importance_sampling_ratio/mean': 0.9999856352806091, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.257652719592443e-05, 'epoch': 0.38}
+
+ 41%|████      | 417/1024 [17:55:21<27:55:47, 165.65s/it][AINFO 12-01 13:26:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:26:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:26:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:26:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 418/1024 [17:58:00<27:34:12, 163.78s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002246022690087557, 'learning_rate': 1e-05, 'num_tokens': 352302991.0, 'completions/mean_length': 7082.0625, 'completions/min_length': 1187.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6703.9345703125, 'completions/min_terminated_length': 1187.0, 'completions/max_terminated_length': 16186.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2393408566713333, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019953187555074692, 'sampling/sampling_logp_difference/max': 3.834923267364502, 'sampling/importance_sampling_ratio/min': 0.021602995693683624, 'sampling/importance_sampling_ratio/mean': 1.0000483989715576, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.077852554473793e-05, 'epoch': 0.38}
+
+ 41%|████      | 418/1024 [17:58:00<27:34:12, 163.78s/it][AINFO 12-01 13:29:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:29:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:29:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:29:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████      | 419/1024 [18:00:43<27:26:45, 163.31s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022067681420594454, 'learning_rate': 1e-05, 'num_tokens': 353222066.0, 'completions/mean_length': 7030.8359375, 'completions/min_length': 908.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6882.37353515625, 'completions/min_terminated_length': 908.0, 'completions/max_terminated_length': 16088.0, 'rewards/accuracy_reward/mean': 0.6171875, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.6171875, 'reward_std': 0.3669157028198242, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018212012946605682, 'sampling/sampling_logp_difference/max': 3.245823383331299, 'sampling/importance_sampling_ratio/min': 0.038936492055654526, 'sampling/importance_sampling_ratio/mean': 1.00001859664917, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.097331674936868e-05, 'epoch': 0.39}
+
+ 41%|████      | 419/1024 [18:00:43<27:26:45, 163.31s/it][AINFO 12-01 13:31:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:31:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:31:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:31:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 41%|████      | 420/1024 [18:03:10<26:36:14, 158.57s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001403068657964468, 'learning_rate': 1e-05, 'num_tokens': 354057486.0, 'completions/mean_length': 6360.90625, 'completions/min_length': 773.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6281.984375, 'completions/min_terminated_length': 773.0, 'completions/max_terminated_length': 14441.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019863341003656387, 'sampling/sampling_logp_difference/max': 2.1021456718444824, 'sampling/importance_sampling_ratio/min': 0.12219396233558655, 'sampling/importance_sampling_ratio/mean': 0.9999563097953796, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.603046727424953e-05, 'epoch': 0.39}
+
+ 41%|████      | 420/1024 [18:03:10<26:36:14, 158.57s/it][AINFO 12-01 13:34:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:34:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:34:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:34:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 41%|████      | 421/1024 [18:06:05<27:22:23, 163.42s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002135329879820347, 'learning_rate': 1e-05, 'num_tokens': 354962842.0, 'completions/mean_length': 6935.53125, 'completions/min_length': 1287.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6470.85205078125, 'completions/min_terminated_length': 1287.0, 'completions/max_terminated_length': 16127.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017958028241991997, 'sampling/sampling_logp_difference/max': 3.0471086502075195, 'sampling/importance_sampling_ratio/min': 0.04749605432152748, 'sampling/importance_sampling_ratio/mean': 0.9998914003372192, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.928459020447917e-05, 'epoch': 0.39}
+
+ 41%|████      | 421/1024 [18:06:05<27:22:23, 163.42s/it][AINFO 12-01 13:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:37:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:37:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 41%|████      | 422/1024 [18:08:49<27:22:59, 163.75s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001917378744110465, 'learning_rate': 1e-05, 'num_tokens': 356036295.0, 'completions/mean_length': 8250.6640625, 'completions/min_length': 383.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8121.56396484375, 'completions/min_terminated_length': 383.0, 'completions/max_terminated_length': 15818.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020758051425218582, 'sampling/sampling_logp_difference/max': 3.263719081878662, 'sampling/importance_sampling_ratio/min': 0.03824589401483536, 'sampling/importance_sampling_ratio/mean': 0.9999714493751526, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.743662546185078e-05, 'epoch': 0.39}
+
+ 41%|████      | 422/1024 [18:08:49<27:22:59, 163.75s/it][AINFO 12-01 13:40:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:40:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:40:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:40:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 41%|████▏     | 423/1024 [18:11:38<27:34:44, 165.20s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002147022169083357, 'learning_rate': 1e-05, 'num_tokens': 356968012.0, 'completions/mean_length': 7117.6640625, 'completions/min_length': 905.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7044.70068359375, 'completions/min_terminated_length': 905.0, 'completions/max_terminated_length': 15847.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2130674123764038, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020070120692253113, 'sampling/sampling_logp_difference/max': 2.990800142288208, 'sampling/importance_sampling_ratio/min': 0.05024721845984459, 'sampling/importance_sampling_ratio/mean': 1.0000369548797607, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.6784634542309504e-05, 'epoch': 0.39}
+
+ 41%|████▏     | 423/1024 [18:11:38<27:34:44, 165.20s/it][AINFO 12-01 13:42:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:42:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:42:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:42:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 41%|████▏     | 424/1024 [18:14:09<26:49:19, 160.93s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0022378452122211456, 'learning_rate': 1e-05, 'num_tokens': 357772454.0, 'completions/mean_length': 6151.578125, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6071.0078125, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2893138825893402, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020592208951711655, 'sampling/sampling_logp_difference/max': 3.729264736175537, 'sampling/importance_sampling_ratio/min': 0.02401048317551613, 'sampling/importance_sampling_ratio/mean': 1.0000063180923462, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.26790745677863e-05, 'epoch': 0.39}
+
+ 41%|████▏     | 424/1024 [18:14:09<26:49:19, 160.93s/it][AINFO 12-01 13:45:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:45:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:45:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:45:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 425/1024 [18:16:42<26:24:18, 158.69s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0028859705198556185, 'learning_rate': 1e-05, 'num_tokens': 358640232.0, 'completions/mean_length': 6629.703125, 'completions/min_length': 1186.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6552.8974609375, 'completions/min_terminated_length': 1186.0, 'completions/max_terminated_length': 16318.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.28011518716812134, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01827368512749672, 'sampling/sampling_logp_difference/max': 9.278707504272461, 'sampling/importance_sampling_ratio/min': 9.33917544898577e-05, 'sampling/importance_sampling_ratio/mean': 0.9999707937240601, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.315070611937699e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 425/1024 [18:16:42<26:24:18, 158.69s/it][AINFO 12-01 13:47:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:47:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:47:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:47:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 426/1024 [18:19:16<26:07:29, 157.27s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001861788798123598, 'learning_rate': 1e-05, 'num_tokens': 359515134.0, 'completions/mean_length': 6616.859375, 'completions/min_length': 1199.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6539.95263671875, 'completions/min_terminated_length': 1199.0, 'completions/max_terminated_length': 15634.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.13781969249248505, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.020854000002145767, 'sampling/sampling_logp_difference/max': 3.4720840454101562, 'sampling/importance_sampling_ratio/min': 0.031052248552441597, 'sampling/importance_sampling_ratio/mean': 0.999949038028717, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.7924318146688165e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 426/1024 [18:19:16<26:07:29, 157.27s/it][AINFO 12-01 13:50:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:50:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:50:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:50:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 42%|████▏     | 427/1024 [18:21:40<25:24:40, 153.23s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015603898791596293, 'learning_rate': 1e-05, 'num_tokens': 360405056.0, 'completions/mean_length': 6807.265625, 'completions/min_length': 1667.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6731.8583984375, 'completions/min_terminated_length': 1667.0, 'completions/max_terminated_length': 14327.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2709311842918396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0193399079144001, 'sampling/sampling_logp_difference/max': 7.226063251495361, 'sampling/importance_sampling_ratio/min': 0.0007273787632584572, 'sampling/importance_sampling_ratio/mean': 0.9999979734420776, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.787362868479249e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 427/1024 [18:21:40<25:24:40, 153.23s/it][AINFO 12-01 13:52:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:52:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:52:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:52:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 428/1024 [18:23:59<24:40:26, 149.04s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002116540214046836, 'learning_rate': 1e-05, 'num_tokens': 361244483.0, 'completions/mean_length': 6416.8359375, 'completions/min_length': 1297.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6338.3544921875, 'completions/min_terminated_length': 1297.0, 'completions/max_terminated_length': 15536.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.4100441336631775, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.01914902776479721, 'sampling/sampling_logp_difference/max': 1.5276240110397339, 'sampling/importance_sampling_ratio/min': 0.21705077588558197, 'sampling/importance_sampling_ratio/mean': 1.0000052452087402, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.253474018398265e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 428/1024 [18:23:59<24:40:26, 149.04s/it][AINFO 12-01 13:55:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:55:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:55:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:55:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 429/1024 [18:27:02<26:18:46, 159.20s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0008747109095565975, 'learning_rate': 1e-05, 'num_tokens': 362212896.0, 'completions/mean_length': 7400.4140625, 'completions/min_length': 885.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6958.59814453125, 'completions/min_terminated_length': 885.0, 'completions/max_terminated_length': 15875.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.20069600641727448, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021288594231009483, 'sampling/sampling_logp_difference/max': 1.6364035606384277, 'sampling/importance_sampling_ratio/min': 0.27797213196754456, 'sampling/importance_sampling_ratio/mean': 1.0000046491622925, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.817547619495599e-05, 'epoch': 0.39}
+
+ 42%|████▏     | 429/1024 [18:27:02<26:18:46, 159.20s/it][AINFO 12-01 13:58:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:58:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:58:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 13:58:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 430/1024 [18:29:52<26:46:52, 162.31s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002167422091588378, 'learning_rate': 1e-05, 'num_tokens': 363134222.0, 'completions/mean_length': 7061.546875, 'completions/min_length': 792.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6760.822265625, 'completions/min_terminated_length': 792.0, 'completions/max_terminated_length': 15874.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.37716543674468994, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019034095108509064, 'sampling/sampling_logp_difference/max': 2.113288164138794, 'sampling/importance_sampling_ratio/min': 0.12083996832370758, 'sampling/importance_sampling_ratio/mean': 1.0000550746917725, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.574818437205977e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 430/1024 [18:29:52<26:46:52, 162.31s/it][AINFO 12-01 14:01:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:01:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:01:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:01:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 431/1024 [18:32:24<26:15:34, 159.42s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018200442427769303, 'learning_rate': 1e-05, 'num_tokens': 363962175.0, 'completions/mean_length': 6303.6953125, 'completions/min_length': 830.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6143.69091796875, 'completions/min_terminated_length': 830.0, 'completions/max_terminated_length': 14126.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.2314501404762268, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018610354512929916, 'sampling/sampling_logp_difference/max': 2.4252538681030273, 'sampling/importance_sampling_ratio/min': 0.08845566213130951, 'sampling/importance_sampling_ratio/mean': 1.0000460147857666, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.22908958626067e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 431/1024 [18:32:24<26:15:34, 159.42s/it][AINFO 12-01 14:03:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:03:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:03:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:03:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 432/1024 [18:35:07<26:23:28, 160.49s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002277121413499117, 'learning_rate': 1e-05, 'num_tokens': 364852857.0, 'completions/mean_length': 6785.515625, 'completions/min_length': 660.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6475.88671875, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 16052.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0189058780670166, 'sampling/sampling_logp_difference/max': 4.674638271331787, 'sampling/importance_sampling_ratio/min': 0.009328898973762989, 'sampling/importance_sampling_ratio/mean': 1.000082015991211, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.737248027846363e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 432/1024 [18:35:07<26:23:28, 160.49s/it][AINFO 12-01 14:06:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:06:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:06:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:06:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 433/1024 [18:37:58<26:50:50, 163.54s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014180162688717246, 'learning_rate': 1e-05, 'num_tokens': 365874873.0, 'completions/mean_length': 7822.0625, 'completions/min_length': 1103.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7174.521484375, 'completions/min_terminated_length': 1103.0, 'completions/max_terminated_length': 16255.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019482173025608063, 'sampling/sampling_logp_difference/max': 8.068410873413086, 'sampling/importance_sampling_ratio/min': 0.00031328073237091303, 'sampling/importance_sampling_ratio/mean': 0.9999924898147583, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.606683296515257e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 433/1024 [18:37:58<26:50:50, 163.54s/it][AINFO 12-01 14:09:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:09:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:09:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:09:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 434/1024 [18:40:26<26:02:37, 158.91s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018428050680086017, 'learning_rate': 1e-05, 'num_tokens': 366659012.0, 'completions/mean_length': 5985.5234375, 'completions/min_length': 865.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5820.46875, 'completions/min_terminated_length': 865.0, 'completions/max_terminated_length': 16133.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.22461043298244476, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018497779965400696, 'sampling/sampling_logp_difference/max': 6.951936721801758, 'sampling/importance_sampling_ratio/min': 0.0009567803354002535, 'sampling/importance_sampling_ratio/mean': 0.9999692440032959, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.390095693906915e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 434/1024 [18:40:26<26:02:37, 158.91s/it][AINFO 12-01 14:11:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:11:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:11:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:11:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 42%|████▏     | 435/1024 [18:43:05<25:59:37, 158.87s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0013368086656555533, 'learning_rate': 1e-05, 'num_tokens': 367417482.0, 'completions/mean_length': 5759.234375, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5590.58740234375, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 14006.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.65625, 'reward_std': 0.2432974874973297, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017376579344272614, 'sampling/sampling_logp_difference/max': 3.4259679317474365, 'sampling/importance_sampling_ratio/min': 0.03251779079437256, 'sampling/importance_sampling_ratio/mean': 0.9999662041664124, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1506207910279045e-05, 'epoch': 0.4}
+
+ 42%|████▏     | 435/1024 [18:43:05<25:59:37, 158.87s/it][AINFO 12-01 14:14:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:14:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:14:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:14:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 436/1024 [18:45:47<26:06:36, 159.86s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001232087961398065, 'learning_rate': 1e-05, 'num_tokens': 368378221.0, 'completions/mean_length': 7341.2734375, 'completions/min_length': 1055.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7197.73828125, 'completions/min_terminated_length': 1055.0, 'completions/max_terminated_length': 16290.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.32507073879241943, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020317785441875458, 'sampling/sampling_logp_difference/max': 3.438768148422241, 'sampling/importance_sampling_ratio/min': 0.03210420906543732, 'sampling/importance_sampling_ratio/mean': 0.9999468922615051, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.675674290410825e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 436/1024 [18:45:47<26:06:36, 159.86s/it][AINFO 12-01 14:17:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:17:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:17:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:17:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 437/1024 [18:48:32<26:17:47, 161.27s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013168298173695803, 'learning_rate': 1e-05, 'num_tokens': 369281741.0, 'completions/mean_length': 6894.5625, 'completions/min_length': 1213.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6819.84228515625, 'completions/min_terminated_length': 1213.0, 'completions/max_terminated_length': 16201.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.26143795251846313, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020312929525971413, 'sampling/sampling_logp_difference/max': 2.2337045669555664, 'sampling/importance_sampling_ratio/min': 0.10713081806898117, 'sampling/importance_sampling_ratio/mean': 0.9999969005584717, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5359715639060596e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 437/1024 [18:48:32<26:17:47, 161.27s/it][AINFO 12-01 14:19:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:19:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:19:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:19:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 438/1024 [18:51:16<26:24:45, 162.26s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018373837228864431, 'learning_rate': 1e-05, 'num_tokens': 370243647.0, 'completions/mean_length': 7378.140625, 'completions/min_length': 1188.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7235.19091796875, 'completions/min_terminated_length': 1188.0, 'completions/max_terminated_length': 14123.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2914257347583771, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02144196815788746, 'sampling/sampling_logp_difference/max': 4.713451862335205, 'sampling/importance_sampling_ratio/min': 0.008973748423159122, 'sampling/importance_sampling_ratio/mean': 1.00002121925354, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.29091102584789e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 438/1024 [18:51:16<26:24:45, 162.26s/it][AINFO 12-01 14:22:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:22:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:22:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:22:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 439/1024 [18:53:41<25:30:30, 156.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002317083068192005, 'learning_rate': 1e-05, 'num_tokens': 371118647.0, 'completions/mean_length': 6665.625, 'completions/min_length': 1210.0, 'completions/max_length': 15817.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6665.625, 'completions/min_terminated_length': 1210.0, 'completions/max_terminated_length': 15817.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.34116142988204956, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01913169026374817, 'sampling/sampling_logp_difference/max': 1.929722785949707, 'sampling/importance_sampling_ratio/min': 0.14518843591213226, 'sampling/importance_sampling_ratio/mean': 1.0000214576721191, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.627980024451972e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 439/1024 [18:53:41<25:30:30, 156.98s/it][AINFO 12-01 14:24:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:24:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:24:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:24:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 440/1024 [18:56:17<25:24:34, 156.63s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010503862285986543, 'learning_rate': 1e-05, 'num_tokens': 371977510.0, 'completions/mean_length': 6570.6171875, 'completions/min_length': 694.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6171.69873046875, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 15941.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.18990950286388397, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02102888748049736, 'sampling/sampling_logp_difference/max': 1.9050054550170898, 'sampling/importance_sampling_ratio/min': 0.14882183074951172, 'sampling/importance_sampling_ratio/mean': 1.000065565109253, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.068988475111837e-05, 'epoch': 0.4}
+
+ 43%|████▎     | 440/1024 [18:56:17<25:24:34, 156.63s/it][AINFO 12-01 14:27:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:27:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:27:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:27:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 441/1024 [18:58:59<25:38:19, 158.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014759977348148823, 'learning_rate': 1e-05, 'num_tokens': 372866285.0, 'completions/mean_length': 6766.7421875, 'completions/min_length': 1478.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6535.92822265625, 'completions/min_terminated_length': 1478.0, 'completions/max_terminated_length': 15725.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.27434611320495605, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020376287400722504, 'sampling/sampling_logp_difference/max': 1.3152861595153809, 'sampling/importance_sampling_ratio/min': 0.27071043848991394, 'sampling/importance_sampling_ratio/mean': 1.0000669956207275, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.016077795971796e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 441/1024 [18:58:59<25:38:19, 158.32s/it][AINFO 12-01 14:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:30:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:30:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 442/1024 [19:01:59<26:39:04, 164.85s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022281028795987368, 'learning_rate': 1e-05, 'num_tokens': 373870468.0, 'completions/mean_length': 7706.2421875, 'completions/min_length': 683.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7204.22265625, 'completions/min_terminated_length': 683.0, 'completions/max_terminated_length': 15269.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.21040897071361542, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021154627203941345, 'sampling/sampling_logp_difference/max': 6.1498236656188965, 'sampling/importance_sampling_ratio/min': 0.002133857924491167, 'sampling/importance_sampling_ratio/mean': 1.0000420808792114, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.3121784806789947e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 442/1024 [19:01:59<26:39:04, 164.85s/it][AINFO 12-01 14:33:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:33:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:33:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:33:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 443/1024 [19:04:35<26:11:17, 162.27s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012459541903808713, 'learning_rate': 1e-05, 'num_tokens': 374873840.0, 'completions/mean_length': 7653.46875, 'completions/min_length': 1188.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7371.83837890625, 'completions/min_terminated_length': 1188.0, 'completions/max_terminated_length': 15544.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3135277330875397, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020770955830812454, 'sampling/sampling_logp_difference/max': 1.5551583766937256, 'sampling/importance_sampling_ratio/min': 0.2111559361219406, 'sampling/importance_sampling_ratio/mean': 0.9999937415122986, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.557116458021483e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 443/1024 [19:04:35<26:11:17, 162.27s/it][AINFO 12-01 14:35:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:35:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:35:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:35:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 444/1024 [19:07:09<25:44:54, 159.82s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0008881304529495537, 'learning_rate': 1e-05, 'num_tokens': 375841348.0, 'completions/mean_length': 7386.46875, 'completions/min_length': 948.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7315.6220703125, 'completions/min_terminated_length': 948.0, 'completions/max_terminated_length': 15266.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.24489018321037292, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02098047360777855, 'sampling/sampling_logp_difference/max': 2.2442626953125, 'sampling/importance_sampling_ratio/min': 0.10600567609071732, 'sampling/importance_sampling_ratio/mean': 0.9999724626541138, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.679034848071751e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 444/1024 [19:07:09<25:44:54, 159.82s/it][AINFO 12-01 14:38:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:38:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:38:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:38:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 43%|████▎     | 445/1024 [19:10:00<26:13:53, 163.10s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017967434832826257, 'learning_rate': 1e-05, 'num_tokens': 376838961.0, 'completions/mean_length': 7627.1015625, 'completions/min_length': 1347.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7344.62060546875, 'completions/min_terminated_length': 1347.0, 'completions/max_terminated_length': 16093.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.21778056025505066, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01877785287797451, 'sampling/sampling_logp_difference/max': 3.395616054534912, 'sampling/importance_sampling_ratio/min': 0.03351989760994911, 'sampling/importance_sampling_ratio/mean': 0.99991375207901, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.519234880717704e-05, 'epoch': 0.41}
+
+ 43%|████▎     | 445/1024 [19:10:00<26:13:53, 163.10s/it][AINFO 12-01 14:41:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:41:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:41:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:41:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▎     | 446/1024 [19:12:21<25:07:18, 156.47s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002020155545324087, 'learning_rate': 1e-05, 'num_tokens': 377572345.0, 'completions/mean_length': 5526.5, 'completions/min_length': 730.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5441.0078125, 'completions/min_terminated_length': 730.0, 'completions/max_terminated_length': 15799.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017539039254188538, 'sampling/sampling_logp_difference/max': 3.41158127784729, 'sampling/importance_sampling_ratio/min': 0.03298899531364441, 'sampling/importance_sampling_ratio/mean': 0.9999401569366455, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7019173848639184e-05, 'epoch': 0.41}
+
+ 44%|████▎     | 446/1024 [19:12:21<25:07:18, 156.47s/it][AINFO 12-01 14:43:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:43:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:43:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:43:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▎     | 447/1024 [19:15:05<25:25:50, 158.67s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016213897615671158, 'learning_rate': 1e-05, 'num_tokens': 378564234.0, 'completions/mean_length': 7609.3828125, 'completions/min_length': 1228.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6945.7568359375, 'completions/min_terminated_length': 1228.0, 'completions/max_terminated_length': 14950.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0205511674284935, 'sampling/sampling_logp_difference/max': 2.873350143432617, 'sampling/importance_sampling_ratio/min': 0.05650929734110832, 'sampling/importance_sampling_ratio/mean': 0.9999391436576843, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.683286321480409e-05, 'epoch': 0.41}
+
+ 44%|████▎     | 447/1024 [19:15:05<25:25:50, 158.67s/it][AINFO 12-01 14:46:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:46:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:46:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:46:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 448/1024 [19:17:47<25:32:36, 159.65s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001374174957163632, 'learning_rate': 1e-05, 'num_tokens': 379468016.0, 'completions/mean_length': 6889.859375, 'completions/min_length': 878.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6503.91845703125, 'completions/min_terminated_length': 878.0, 'completions/max_terminated_length': 16015.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.27564430236816406, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019248928874731064, 'sampling/sampling_logp_difference/max': 2.707213878631592, 'sampling/importance_sampling_ratio/min': 0.16159364581108093, 'sampling/importance_sampling_ratio/mean': 0.999980092048645, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.853872269450221e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 448/1024 [19:17:47<25:32:36, 159.65s/it][AINFO 12-01 14:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:49:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:49:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 449/1024 [19:20:51<26:39:39, 166.92s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001686620176769793, 'learning_rate': 1e-05, 'num_tokens': 380402274.0, 'completions/mean_length': 7128.203125, 'completions/min_length': 1192.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6511.150390625, 'completions/min_terminated_length': 1192.0, 'completions/max_terminated_length': 16102.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017203927040100098, 'sampling/sampling_logp_difference/max': 2.8486881256103516, 'sampling/importance_sampling_ratio/min': 0.05792025476694107, 'sampling/importance_sampling_ratio/mean': 1.0000317096710205, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.235420434246407e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 449/1024 [19:20:51<26:39:39, 166.92s/it][AINFO 12-01 14:52:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:52:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:52:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:52:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 44%|████▍     | 450/1024 [19:23:25<26:00:31, 163.12s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002831409452483058, 'learning_rate': 1e-05, 'num_tokens': 381289913.0, 'completions/mean_length': 6796.7421875, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6644.56396484375, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 15406.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.26249873638153076, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02045862004160881, 'sampling/sampling_logp_difference/max': 2.5472664833068848, 'sampling/importance_sampling_ratio/min': 0.07829539477825165, 'sampling/importance_sampling_ratio/mean': 0.9998919367790222, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.788729529536795e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 450/1024 [19:23:25<26:00:31, 163.12s/it][AINFO 12-01 14:54:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:54:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:54:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:54:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 451/1024 [19:26:03<25:41:22, 161.40s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002012337325140834, 'learning_rate': 1e-05, 'num_tokens': 382151188.0, 'completions/mean_length': 6564.4609375, 'completions/min_length': 801.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6487.1416015625, 'completions/min_terminated_length': 801.0, 'completions/max_terminated_length': 16330.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02115769311785698, 'sampling/sampling_logp_difference/max': 1.3232743740081787, 'sampling/importance_sampling_ratio/min': 0.27212387323379517, 'sampling/importance_sampling_ratio/mean': 1.0000202655792236, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.9321339891575917e-05, 'epoch': 0.41}
+
+ 44%|████▍     | 451/1024 [19:26:03<25:41:22, 161.40s/it][AINFO 12-01 14:57:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:57:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:57:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 14:57:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 452/1024 [19:29:17<27:14:19, 171.43s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015543104382231832, 'learning_rate': 1e-05, 'num_tokens': 383246847.0, 'completions/mean_length': 8386.8359375, 'completions/min_length': 1035.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8061.74755859375, 'completions/min_terminated_length': 1035.0, 'completions/max_terminated_length': 16306.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.23410364985466003, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020390262827277184, 'sampling/sampling_logp_difference/max': 2.798407554626465, 'sampling/importance_sampling_ratio/min': 0.06090697646141052, 'sampling/importance_sampling_ratio/mean': 0.999999463558197, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.723450754478108e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 452/1024 [19:29:17<27:14:19, 171.43s/it][AINFO 12-01 15:00:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:00:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:00:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:00:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 453/1024 [19:32:01<26:48:59, 169.07s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0021093247924000025, 'learning_rate': 1e-05, 'num_tokens': 384159822.0, 'completions/mean_length': 6987.3671875, 'completions/min_length': 773.0, 'completions/max_length': 16122.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6987.3671875, 'completions/min_terminated_length': 773.0, 'completions/max_terminated_length': 16122.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2120065987110138, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02046578750014305, 'sampling/sampling_logp_difference/max': 7.64358377456665, 'sampling/importance_sampling_ratio/min': 0.00047910833382047713, 'sampling/importance_sampling_ratio/mean': 1.0000823736190796, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.681646424207429e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 453/1024 [19:32:01<26:48:59, 169.07s/it][AINFO 12-01 15:03:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:03:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:03:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:03:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 454/1024 [19:35:03<27:24:07, 173.07s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018901432631537318, 'learning_rate': 1e-05, 'num_tokens': 385354954.0, 'completions/mean_length': 9174.90625, 'completions/min_length': 1516.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8820.3603515625, 'completions/min_terminated_length': 1516.0, 'completions/max_terminated_length': 16212.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020614182576537132, 'sampling/sampling_logp_difference/max': 1.7311882972717285, 'sampling/importance_sampling_ratio/min': 0.1770738661289215, 'sampling/importance_sampling_ratio/mean': 1.000016689300537, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2532736390276114e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 454/1024 [19:35:03<27:24:07, 173.07s/it][AINFO 12-01 15:06:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:06:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:06:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:06:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 44%|████▍     | 455/1024 [19:37:49<26:59:23, 170.76s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012458597775548697, 'learning_rate': 1e-05, 'num_tokens': 386321976.0, 'completions/mean_length': 7411.484375, 'completions/min_length': 984.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7269.06396484375, 'completions/min_terminated_length': 984.0, 'completions/max_terminated_length': 15966.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2977413833141327, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020720863714814186, 'sampling/sampling_logp_difference/max': 2.227597713470459, 'sampling/importance_sampling_ratio/min': 0.10778705775737762, 'sampling/importance_sampling_ratio/mean': 1.0000202655792236, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.512836198955483e-05, 'epoch': 0.42}
+
+ 44%|████▍     | 455/1024 [19:37:49<26:59:23, 170.76s/it][AINFO 12-01 15:09:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:09:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:09:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:09:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▍     | 456/1024 [19:40:20<26:01:48, 164.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002139081247150898, 'learning_rate': 1e-05, 'num_tokens': 387218339.0, 'completions/mean_length': 6841.3359375, 'completions/min_length': 623.0, 'completions/max_length': 16277.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6841.3359375, 'completions/min_terminated_length': 623.0, 'completions/max_terminated_length': 16277.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2527858018875122, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020805353298783302, 'sampling/sampling_logp_difference/max': 7.749989032745361, 'sampling/importance_sampling_ratio/min': 0.0004307472554501146, 'sampling/importance_sampling_ratio/mean': 1.0000125169754028, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5855065334544634e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 456/1024 [19:40:20<26:01:48, 164.98s/it][AINFO 12-01 15:11:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:11:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:11:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:11:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▍     | 457/1024 [19:43:50<28:06:39, 178.48s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0009118648595176637, 'learning_rate': 1e-05, 'num_tokens': 388243430.0, 'completions/mean_length': 7853.7734375, 'completions/min_length': 1475.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7507.01611328125, 'completions/min_terminated_length': 1475.0, 'completions/max_terminated_length': 16265.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02049083076417446, 'sampling/sampling_logp_difference/max': 2.6902785301208496, 'sampling/importance_sampling_ratio/min': 0.06786203384399414, 'sampling/importance_sampling_ratio/mean': 0.9999375343322754, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.357344914751593e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 457/1024 [19:43:50<28:06:39, 178.48s/it][AINFO 12-01 15:15:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:15:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:15:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:15:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▍     | 458/1024 [19:46:47<27:58:42, 177.95s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001902865362353623, 'learning_rate': 1e-05, 'num_tokens': 389203906.0, 'completions/mean_length': 7308.84375, 'completions/min_length': 1279.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6703.83349609375, 'completions/min_terminated_length': 1279.0, 'completions/max_terminated_length': 15966.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.23251095414161682, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02143084444105625, 'sampling/sampling_logp_difference/max': 7.741305351257324, 'sampling/importance_sampling_ratio/min': 0.00043450400698930025, 'sampling/importance_sampling_ratio/mean': 1.0000170469284058, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5463275910151424e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 458/1024 [19:46:47<27:58:42, 177.95s/it][AINFO 12-01 15:18:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:18:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:18:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:18:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▍     | 459/1024 [19:49:31<27:15:42, 173.70s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009721667156554759, 'learning_rate': 1e-05, 'num_tokens': 390181343.0, 'completions/mean_length': 7485.1015625, 'completions/min_length': 1044.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7198.0400390625, 'completions/min_terminated_length': 1044.0, 'completions/max_terminated_length': 16335.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.24671243131160736, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019911974668502808, 'sampling/sampling_logp_difference/max': 2.153526782989502, 'sampling/importance_sampling_ratio/min': 0.13441969454288483, 'sampling/importance_sampling_ratio/mean': 0.9999589323997498, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5365305595623795e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 459/1024 [19:49:31<27:15:42, 173.70s/it][AINFO 12-01 15:20:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:20:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:20:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:20:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▍     | 460/1024 [19:52:18<26:54:30, 171.76s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0026427023112773895, 'learning_rate': 1e-05, 'num_tokens': 391121306.0, 'completions/mean_length': 7199.2734375, 'completions/min_length': 702.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6978.84033203125, 'completions/min_terminated_length': 702.0, 'completions/max_terminated_length': 16049.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3826971650123596, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018386254087090492, 'sampling/sampling_logp_difference/max': 3.199183464050293, 'sampling/importance_sampling_ratio/min': 0.040795501321554184, 'sampling/importance_sampling_ratio/mean': 0.9999700784683228, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.119960744399577e-05, 'epoch': 0.42}
+
+ 45%|████▍     | 460/1024 [19:52:18<26:54:30, 171.76s/it][AINFO 12-01 15:23:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:23:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:23:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:23:35 [block_pool.py:292] Successfully reset prefix cache
+
+ 45%|████▌     | 461/1024 [19:54:56<26:14:05, 167.75s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.00169390719383955, 'learning_rate': 1e-05, 'num_tokens': 392037602.0, 'completions/mean_length': 7004.3125, 'completions/min_length': 1321.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6930.45654296875, 'completions/min_terminated_length': 1321.0, 'completions/max_terminated_length': 15706.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.19226360321044922, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019455933943390846, 'sampling/sampling_logp_difference/max': 2.678997278213501, 'sampling/importance_sampling_ratio/min': 0.06863193213939667, 'sampling/importance_sampling_ratio/mean': 1.0000141859054565, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6296820755742374e-05, 'epoch': 0.42}
+
+ 45%|████▌     | 461/1024 [19:54:56<26:14:05, 167.75s/it][AINFO 12-01 15:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:26:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:26:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▌     | 462/1024 [19:57:36<25:49:40, 165.45s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001759183476679027, 'learning_rate': 1e-05, 'num_tokens': 393076104.0, 'completions/mean_length': 7893.171875, 'completions/min_length': 581.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7559.72314453125, 'completions/min_terminated_length': 581.0, 'completions/max_terminated_length': 15998.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.24329257011413574, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020976314321160316, 'sampling/sampling_logp_difference/max': 2.944356918334961, 'sampling/importance_sampling_ratio/min': 0.05263589695096016, 'sampling/importance_sampling_ratio/mean': 1.0000724792480469, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.764566526522685e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 462/1024 [19:57:36<25:49:40, 165.45s/it][AINFO 12-01 15:28:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:28:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:28:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:28:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▌     | 463/1024 [20:00:17<25:33:58, 164.06s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001272660563699901, 'learning_rate': 1e-05, 'num_tokens': 394030528.0, 'completions/mean_length': 7320.375, 'completions/min_length': 1087.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7176.50830078125, 'completions/min_terminated_length': 1087.0, 'completions/max_terminated_length': 16004.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2869548797607422, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0187496617436409, 'sampling/sampling_logp_difference/max': 5.959049701690674, 'sampling/importance_sampling_ratio/min': 0.0025823647156357765, 'sampling/importance_sampling_ratio/mean': 0.9999886155128479, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.760404524655314e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 463/1024 [20:00:17<25:33:58, 164.06s/it][AINFO 12-01 15:31:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:31:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:31:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:31:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▌     | 464/1024 [20:02:52<25:05:03, 161.26s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0027106585912406445, 'learning_rate': 1e-05, 'num_tokens': 394888588.0, 'completions/mean_length': 6548.21875, 'completions/min_length': 875.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6470.771484375, 'completions/min_terminated_length': 875.0, 'completions/max_terminated_length': 16307.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.22621294856071472, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020794736221432686, 'sampling/sampling_logp_difference/max': 2.0370748043060303, 'sampling/importance_sampling_ratio/min': 0.1304096281528473, 'sampling/importance_sampling_ratio/mean': 0.9999959468841553, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.526040584096336e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 464/1024 [20:02:52<25:05:03, 161.26s/it][AINFO 12-01 15:34:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:34:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:34:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:34:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 45%|████▌     | 465/1024 [20:05:34<25:03:57, 161.43s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001480677630752325, 'learning_rate': 1e-05, 'num_tokens': 395783506.0, 'completions/mean_length': 6849.296875, 'completions/min_length': 790.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6774.22021484375, 'completions/min_terminated_length': 790.0, 'completions/max_terminated_length': 16139.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.24830514192581177, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021268365904688835, 'sampling/sampling_logp_difference/max': 3.610440731048584, 'sampling/importance_sampling_ratio/min': 0.02703992836177349, 'sampling/importance_sampling_ratio/mean': 0.9999355673789978, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.350268631649669e-05, 'epoch': 0.43}
+
+ 45%|████▌     | 465/1024 [20:05:34<25:03:57, 161.43s/it][AINFO 12-01 15:36:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:36:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:36:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:36:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▌     | 466/1024 [20:08:05<24:34:10, 158.51s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019248281605541706, 'learning_rate': 1e-05, 'num_tokens': 396640213.0, 'completions/mean_length': 6539.1484375, 'completions/min_length': 826.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6221.572265625, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 15011.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01918519102036953, 'sampling/sampling_logp_difference/max': 3.225762367248535, 'sampling/importance_sampling_ratio/min': 0.03972548246383667, 'sampling/importance_sampling_ratio/mean': 1.0000699758529663, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.066232372679224e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 466/1024 [20:08:05<24:34:10, 158.51s/it][AINFO 12-01 15:39:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:39:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:39:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:39:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▌     | 467/1024 [20:11:23<26:21:12, 170.33s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0025619270745664835, 'learning_rate': 1e-05, 'num_tokens': 397820526.0, 'completions/mean_length': 9057.6328125, 'completions/min_length': 1609.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8821.2978515625, 'completions/min_terminated_length': 1609.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02239348366856575, 'sampling/sampling_logp_difference/max': 3.008070468902588, 'sampling/importance_sampling_ratio/min': 0.0493868812918663, 'sampling/importance_sampling_ratio/mean': 1.0000107288360596, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.741925431517302e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 467/1024 [20:11:23<26:21:12, 170.33s/it][AINFO 12-01 15:42:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:42:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:42:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:42:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▌     | 468/1024 [20:14:13<26:16:29, 170.12s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001929386518895626, 'learning_rate': 1e-05, 'num_tokens': 398799814.0, 'completions/mean_length': 7513.3125, 'completions/min_length': 1101.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7372.50830078125, 'completions/min_terminated_length': 1101.0, 'completions/max_terminated_length': 15748.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2869548797607422, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021764786913990974, 'sampling/sampling_logp_difference/max': 2.092804431915283, 'sampling/importance_sampling_ratio/min': 0.12334074825048447, 'sampling/importance_sampling_ratio/mean': 0.9999610185623169, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3205805670586415e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 468/1024 [20:14:13<26:16:29, 170.12s/it][AINFO 12-01 15:45:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:45:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:45:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:45:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▌     | 469/1024 [20:17:01<26:08:59, 169.62s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018022956792265177, 'learning_rate': 1e-05, 'num_tokens': 399799165.0, 'completions/mean_length': 7637.8046875, 'completions/min_length': 835.0, 'completions/max_length': 16203.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7637.8046875, 'completions/min_terminated_length': 835.0, 'completions/max_terminated_length': 16203.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.26538968086242676, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02041994407773018, 'sampling/sampling_logp_difference/max': 19.95710563659668, 'sampling/importance_sampling_ratio/min': 2.151489209012425e-09, 'sampling/importance_sampling_ratio/mean': 0.9999970197677612, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.460717733920319e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 469/1024 [20:17:01<26:08:59, 169.62s/it][AINFO 12-01 15:48:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:48:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:48:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:48:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▌     | 470/1024 [20:19:31<25:10:12, 163.56s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0017809163546189666, 'learning_rate': 1e-05, 'num_tokens': 400612612.0, 'completions/mean_length': 6211.4921875, 'completions/min_length': 763.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5967.35205078125, 'completions/min_terminated_length': 763.0, 'completions/max_terminated_length': 16339.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02162076160311699, 'sampling/sampling_logp_difference/max': 1.8627716302871704, 'sampling/importance_sampling_ratio/min': 0.15524175763130188, 'sampling/importance_sampling_ratio/mean': 0.9999809861183167, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.707511704462377e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 470/1024 [20:19:31<25:10:12, 163.56s/it][AINFO 12-01 15:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:50:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▌     | 471/1024 [20:22:25<25:35:37, 166.61s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0024844296276569366, 'learning_rate': 1e-05, 'num_tokens': 401646216.0, 'completions/mean_length': 7925.71875, 'completions/min_length': 1118.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7581.8857421875, 'completions/min_terminated_length': 1118.0, 'completions/max_terminated_length': 15937.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2109457403421402, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021158821880817413, 'sampling/sampling_logp_difference/max': 2.693974018096924, 'sampling/importance_sampling_ratio/min': 0.06761171668767929, 'sampling/importance_sampling_ratio/mean': 0.9999508857727051, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.9433191918524244e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 471/1024 [20:22:25<25:35:37, 166.61s/it][AINFO 12-01 15:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:53:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▌     | 472/1024 [20:25:08<25:24:44, 165.73s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0032119930256158113, 'learning_rate': 1e-05, 'num_tokens': 402505882.0, 'completions/mean_length': 6544.515625, 'completions/min_length': 1015.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5975.2890625, 'completions/min_terminated_length': 1015.0, 'completions/max_terminated_length': 16325.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01970825530588627, 'sampling/sampling_logp_difference/max': 3.123436689376831, 'sampling/importance_sampling_ratio/min': 0.04400567337870598, 'sampling/importance_sampling_ratio/mean': 0.9999876022338867, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.387318656677962e-05, 'epoch': 0.43}
+
+ 46%|████▌     | 472/1024 [20:25:08<25:24:44, 165.73s/it][AINFO 12-01 15:56:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:56:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:56:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:56:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▌     | 473/1024 [20:28:07<25:58:34, 169.72s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011805761605501175, 'learning_rate': 1e-05, 'num_tokens': 403402555.0, 'completions/mean_length': 6837.8203125, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6368.3359375, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 16384.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01840839348733425, 'sampling/sampling_logp_difference/max': 2.612398624420166, 'sampling/importance_sampling_ratio/min': 0.07335837185382843, 'sampling/importance_sampling_ratio/mean': 1.000012755393982, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1083995622793736e-05, 'epoch': 0.44}
+
+ 46%|████▌     | 473/1024 [20:28:07<25:58:34, 169.72s/it][AINFO 12-01 15:59:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:59:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:59:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 15:59:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▋     | 474/1024 [20:31:07<26:23:57, 172.80s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0024858349934220314, 'learning_rate': 1e-05, 'num_tokens': 404530017.0, 'completions/mean_length': 8653.546875, 'completions/min_length': 1158.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 8206.330078125, 'completions/min_terminated_length': 1158.0, 'completions/max_terminated_length': 16273.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.33114904165267944, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0210890993475914, 'sampling/sampling_logp_difference/max': 4.668471336364746, 'sampling/importance_sampling_ratio/min': 0.009386607445776463, 'sampling/importance_sampling_ratio/mean': 0.9999847412109375, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.377661929946044e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 474/1024 [20:31:07<26:23:57, 172.80s/it][AINFO 12-01 16:02:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:02:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:02:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:02:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 46%|████▋     | 475/1024 [20:33:38<25:20:11, 166.14s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014396592741832137, 'learning_rate': 1e-05, 'num_tokens': 405448833.0, 'completions/mean_length': 7028.8125, 'completions/min_length': 912.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6955.1494140625, 'completions/min_terminated_length': 912.0, 'completions/max_terminated_length': 16269.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2511882185935974, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022308968007564545, 'sampling/sampling_logp_difference/max': 5.8088836669921875, 'sampling/importance_sampling_ratio/min': 0.0030007781460881233, 'sampling/importance_sampling_ratio/mean': 1.0000425577163696, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.920678468854021e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 475/1024 [20:33:38<25:20:11, 166.14s/it][AINFO 12-01 16:04:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:04:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:04:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:04:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 46%|████▋     | 476/1024 [20:36:15<24:53:31, 163.52s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012526597129181027, 'learning_rate': 1e-05, 'num_tokens': 406417313.0, 'completions/mean_length': 7407.3125, 'completions/min_length': 1192.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7264.82568359375, 'completions/min_terminated_length': 1192.0, 'completions/max_terminated_length': 15550.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.31405961513519287, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021511003375053406, 'sampling/sampling_logp_difference/max': 2.220275402069092, 'sampling/importance_sampling_ratio/min': 0.17289510369300842, 'sampling/importance_sampling_ratio/mean': 0.9999992847442627, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.250412136272644e-05, 'epoch': 0.44}
+
+ 46%|████▋     | 476/1024 [20:36:15<24:53:31, 163.52s/it][AINFO 12-01 16:07:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:07:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:07:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:07:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 47%|████▋     | 477/1024 [20:38:57<24:46:38, 163.07s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002589466283097863, 'learning_rate': 1e-05, 'num_tokens': 407264592.0, 'completions/mean_length': 6465.4296875, 'completions/min_length': 1678.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6387.33056640625, 'completions/min_terminated_length': 1678.0, 'completions/max_terminated_length': 14882.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3713865876197815, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02066504955291748, 'sampling/sampling_logp_difference/max': 4.768423557281494, 'sampling/importance_sampling_ratio/min': 0.008493759669363499, 'sampling/importance_sampling_ratio/mean': 1.0000619888305664, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.772328313469188e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 477/1024 [20:38:57<24:46:38, 163.07s/it][AINFO 12-01 16:10:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:10:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:10:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:10:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 478/1024 [20:41:43<24:51:57, 163.95s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003131282515823841, 'learning_rate': 1e-05, 'num_tokens': 408194484.0, 'completions/mean_length': 7109.28125, 'completions/min_length': 1291.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6962.06396484375, 'completions/min_terminated_length': 1291.0, 'completions/max_terminated_length': 16111.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2025182545185089, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02027060277760029, 'sampling/sampling_logp_difference/max': 1.9558563232421875, 'sampling/importance_sampling_ratio/min': 0.14144329726696014, 'sampling/importance_sampling_ratio/mean': 1.0000600814819336, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.232834555750742e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 478/1024 [20:41:43<24:51:57, 163.95s/it][AINFO 12-01 16:13:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:13:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:13:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:13:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 47%|████▋     | 479/1024 [20:44:32<25:03:29, 165.52s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0023977444507181644, 'learning_rate': 1e-05, 'num_tokens': 409149644.0, 'completions/mean_length': 7319.1875, 'completions/min_length': 1298.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6950.69873046875, 'completions/min_terminated_length': 1298.0, 'completions/max_terminated_length': 15390.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2120065689086914, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02096422016620636, 'sampling/sampling_logp_difference/max': 2.2364842891693115, 'sampling/importance_sampling_ratio/min': 0.10683343559503555, 'sampling/importance_sampling_ratio/mean': 1.000037431716919, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.683238239522325e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 479/1024 [20:44:32<25:03:29, 165.52s/it][AINFO 12-01 16:15:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:15:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:15:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:15:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 480/1024 [20:47:34<25:43:11, 170.20s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0005734060541726649, 'learning_rate': 1e-05, 'num_tokens': 410173123.0, 'completions/mean_length': 7841.9921875, 'completions/min_length': 1186.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7421.89306640625, 'completions/min_terminated_length': 1186.0, 'completions/max_terminated_length': 15947.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.16097761690616608, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020381946116685867, 'sampling/sampling_logp_difference/max': 4.052562236785889, 'sampling/importance_sampling_ratio/min': 0.017377791926264763, 'sampling/importance_sampling_ratio/mean': 1.000080943107605, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.7257137332981074e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 480/1024 [20:47:34<25:43:11, 170.20s/it][AINFO 12-01 16:18:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:18:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:18:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:18:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 481/1024 [20:50:19<25:28:32, 168.90s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020060548558831215, 'learning_rate': 1e-05, 'num_tokens': 411127620.0, 'completions/mean_length': 7315.4453125, 'completions/min_length': 1042.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7171.50048828125, 'completions/min_terminated_length': 1042.0, 'completions/max_terminated_length': 15489.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.29719969630241394, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020456131547689438, 'sampling/sampling_logp_difference/max': 8.453479766845703, 'sampling/importance_sampling_ratio/min': 0.00021315738558769226, 'sampling/importance_sampling_ratio/mean': 0.999945878982544, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.171837003421388e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 481/1024 [20:50:19<25:28:32, 168.90s/it][AINFO 12-01 16:21:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:21:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:21:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:21:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 47%|████▋     | 482/1024 [20:53:24<26:08:46, 173.67s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020907472353428602, 'learning_rate': 1e-05, 'num_tokens': 412172988.0, 'completions/mean_length': 8022.8125, 'completions/min_length': 825.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7753.0966796875, 'completions/min_terminated_length': 825.0, 'completions/max_terminated_length': 16260.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3135228157043457, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019644854590296745, 'sampling/sampling_logp_difference/max': 2.7379026412963867, 'sampling/importance_sampling_ratio/min': 0.06470591574907303, 'sampling/importance_sampling_ratio/mean': 1.0000015497207642, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.494255171674013e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 482/1024 [20:53:24<26:08:46, 173.67s/it][AINFO 12-01 16:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:24:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 47%|████▋     | 483/1024 [20:56:25<26:25:38, 175.86s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019314356613904238, 'learning_rate': 1e-05, 'num_tokens': 413067876.0, 'completions/mean_length': 6838.625, 'completions/min_length': 739.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6450.6015625, 'completions/min_terminated_length': 739.0, 'completions/max_terminated_length': 15563.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018709510564804077, 'sampling/sampling_logp_difference/max': 2.1127967834472656, 'sampling/importance_sampling_ratio/min': 0.12089936435222626, 'sampling/importance_sampling_ratio/mean': 1.0000275373458862, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.244513538902538e-05, 'epoch': 0.44}
+
+ 47%|████▋     | 483/1024 [20:56:25<26:25:38, 175.86s/it][AINFO 12-01 16:27:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:27:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:27:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:27:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 47%|████▋     | 484/1024 [20:59:12<25:57:38, 173.07s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002679441822692752, 'learning_rate': 1e-05, 'num_tokens': 413983818.0, 'completions/mean_length': 7016.109375, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6555.39306640625, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 16286.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.26037710905075073, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019316546618938446, 'sampling/sampling_logp_difference/max': 2.1967577934265137, 'sampling/importance_sampling_ratio/min': 0.11116298288106918, 'sampling/importance_sampling_ratio/mean': 0.999997615814209, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.290292412813869e-05, 'epoch': 0.45}
+
+ 47%|████▋     | 484/1024 [20:59:12<25:57:38, 173.07s/it][AINFO 12-01 16:30:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:30:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:30:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:30:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 47%|████▋     | 485/1024 [21:02:06<25:57:49, 173.41s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018422137945890427, 'learning_rate': 1e-05, 'num_tokens': 415035161.0, 'completions/mean_length': 8083.8671875, 'completions/min_length': 1110.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7746.462890625, 'completions/min_terminated_length': 1110.0, 'completions/max_terminated_length': 16284.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020070167258381844, 'sampling/sampling_logp_difference/max': 1.3861682415008545, 'sampling/importance_sampling_ratio/min': 0.2500315308570862, 'sampling/importance_sampling_ratio/mean': 1.0000115633010864, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.853248646621068e-05, 'epoch': 0.45}
+
+ 47%|████▋     | 485/1024 [21:02:06<25:57:49, 173.41s/it][AINFO 12-01 16:33:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:33:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:33:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:33:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 47%|████▋     | 486/1024 [21:04:43<25:10:30, 168.46s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022582169622182846, 'learning_rate': 1e-05, 'num_tokens': 415982166.0, 'completions/mean_length': 7242.1015625, 'completions/min_length': 939.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7170.1181640625, 'completions/min_terminated_length': 939.0, 'completions/max_terminated_length': 15501.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2937847375869751, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02070865035057068, 'sampling/sampling_logp_difference/max': 9.894331932067871, 'sampling/importance_sampling_ratio/min': 5.045988655183464e-05, 'sampling/importance_sampling_ratio/mean': 1.0000041723251343, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.327805263277696e-05, 'epoch': 0.45}
+
+ 47%|████▋     | 486/1024 [21:04:43<25:10:30, 168.46s/it][AINFO 12-01 16:36:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:36:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:36:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:36:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 487/1024 [21:07:12<24:16:03, 162.69s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001935614156536758, 'learning_rate': 1e-05, 'num_tokens': 416745991.0, 'completions/mean_length': 5833.6953125, 'completions/min_length': 758.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5750.6220703125, 'completions/min_terminated_length': 758.0, 'completions/max_terminated_length': 15784.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3066929280757904, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02075774408876896, 'sampling/sampling_logp_difference/max': 3.1871089935302734, 'sampling/importance_sampling_ratio/min': 0.041291072964668274, 'sampling/importance_sampling_ratio/mean': 0.9999702572822571, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.145325647186837e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 487/1024 [21:07:12<24:16:03, 162.69s/it][AINFO 12-01 16:38:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:38:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:38:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:38:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 488/1024 [21:09:33<23:15:40, 156.23s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0023738518357276917, 'learning_rate': 1e-05, 'num_tokens': 417512552.0, 'completions/mean_length': 5795.2578125, 'completions/min_length': 598.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5711.8818359375, 'completions/min_terminated_length': 598.0, 'completions/max_terminated_length': 15612.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01893053948879242, 'sampling/sampling_logp_difference/max': 4.779662132263184, 'sampling/importance_sampling_ratio/min': 0.008398836478590965, 'sampling/importance_sampling_ratio/mean': 0.9999260902404785, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.208109005001461e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 488/1024 [21:09:33<23:15:40, 156.23s/it][AINFO 12-01 16:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:40:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:40:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 489/1024 [21:12:15<23:27:21, 157.83s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020515904761850834, 'learning_rate': 1e-05, 'num_tokens': 418467381.0, 'completions/mean_length': 7315.7890625, 'completions/min_length': 449.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6869.81103515625, 'completions/min_terminated_length': 449.0, 'completions/max_terminated_length': 16260.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02071664109826088, 'sampling/sampling_logp_difference/max': 4.072844505310059, 'sampling/importance_sampling_ratio/min': 0.01702888123691082, 'sampling/importance_sampling_ratio/mean': 0.9999706745147705, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.258190381278837e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 489/1024 [21:12:15<23:27:21, 157.83s/it][AINFO 12-01 16:43:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:43:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:43:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:43:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 490/1024 [21:15:05<23:57:10, 161.48s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017514073988422751, 'learning_rate': 1e-05, 'num_tokens': 419359256.0, 'completions/mean_length': 6807.1484375, 'completions/min_length': 1399.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6417.84521484375, 'completions/min_terminated_length': 1399.0, 'completions/max_terminated_length': 15691.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.37822139263153076, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018606113269925117, 'sampling/sampling_logp_difference/max': 4.961199760437012, 'sampling/importance_sampling_ratio/min': 0.007004518993198872, 'sampling/importance_sampling_ratio/mean': 1.0000120401382446, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.48781924105424e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 490/1024 [21:15:05<23:57:10, 161.48s/it][AINFO 12-01 16:46:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:46:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:46:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:46:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 491/1024 [21:17:45<23:51:54, 161.19s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014053891645744443, 'learning_rate': 1e-05, 'num_tokens': 420372711.0, 'completions/mean_length': 7767.8671875, 'completions/min_length': 945.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7631.103515625, 'completions/min_terminated_length': 945.0, 'completions/max_terminated_length': 13975.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02200550213456154, 'sampling/sampling_logp_difference/max': 3.155059337615967, 'sampling/importance_sampling_ratio/min': 0.04263587296009064, 'sampling/importance_sampling_ratio/mean': 0.999976634979248, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4024706792479265e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 491/1024 [21:17:45<23:51:54, 161.19s/it][AINFO 12-01 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:49:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 492/1024 [21:20:35<24:12:09, 163.78s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.00586542347446084, 'learning_rate': 1e-05, 'num_tokens': 421314130.0, 'completions/mean_length': 7207.2109375, 'completions/min_length': 791.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6986.96826171875, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 16358.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.21648237109184265, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021378377452492714, 'sampling/sampling_logp_difference/max': 2.316610097885132, 'sampling/importance_sampling_ratio/min': 0.09860729426145554, 'sampling/importance_sampling_ratio/mean': 0.9999175071716309, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.085744486317708e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 492/1024 [21:20:35<24:12:09, 163.78s/it][AINFO 12-01 16:51:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:51:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:51:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:51:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 493/1024 [21:23:35<24:52:14, 168.62s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017660683952271938, 'learning_rate': 1e-05, 'num_tokens': 422385762.0, 'completions/mean_length': 8238.3125, 'completions/min_length': 1410.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7907.1865234375, 'completions/min_terminated_length': 1410.0, 'completions/max_terminated_length': 15224.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.30745434761047363, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018216565251350403, 'sampling/sampling_logp_difference/max': 5.268735885620117, 'sampling/importance_sampling_ratio/min': 0.005150116980075836, 'sampling/importance_sampling_ratio/mean': 0.999977707862854, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6629832013422856e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 493/1024 [21:23:35<24:52:14, 168.62s/it][AINFO 12-01 16:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:54:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:54:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 48%|████▊     | 494/1024 [21:26:40<25:32:01, 173.44s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0024857246316969395, 'learning_rate': 1e-05, 'num_tokens': 423399667.0, 'completions/mean_length': 7773.5078125, 'completions/min_length': 379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7566.8564453125, 'completions/min_terminated_length': 379.0, 'completions/max_terminated_length': 16184.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.22461044788360596, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020727891474962234, 'sampling/sampling_logp_difference/max': 7.203612327575684, 'sampling/importance_sampling_ratio/min': 0.0007438937900587916, 'sampling/importance_sampling_ratio/mean': 1.0000083446502686, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.455329954249464e-05, 'epoch': 0.45}
+
+ 48%|████▊     | 494/1024 [21:26:40<25:32:01, 173.44s/it][AINFO 12-01 16:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:57:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 16:57:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 495/1024 [21:29:22<24:58:37, 169.98s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0017703508492559195, 'learning_rate': 1e-05, 'num_tokens': 424249203.0, 'completions/mean_length': 6493.4375, 'completions/min_length': 1219.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6415.55908203125, 'completions/min_terminated_length': 1219.0, 'completions/max_terminated_length': 16359.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01903335377573967, 'sampling/sampling_logp_difference/max': 6.969728469848633, 'sampling/importance_sampling_ratio/min': 0.0009399080881848931, 'sampling/importance_sampling_ratio/mean': 1.000041127204895, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2991083066444844e-05, 'epoch': 0.46}
+
+ 48%|████▊     | 495/1024 [21:29:22<24:58:37, 169.98s/it][AINFO 12-01 17:00:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:00:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:00:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:00:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 48%|████▊     | 496/1024 [21:31:57<24:16:34, 165.52s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020254296250641346, 'learning_rate': 1e-05, 'num_tokens': 425247915.0, 'completions/mean_length': 7662.8125, 'completions/min_length': 1330.0, 'completions/max_length': 15071.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7662.8125, 'completions/min_terminated_length': 1330.0, 'completions/max_terminated_length': 15071.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.33903974294662476, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021763909608125687, 'sampling/sampling_logp_difference/max': 2.469316244125366, 'sampling/importance_sampling_ratio/min': 0.08464271575212479, 'sampling/importance_sampling_ratio/mean': 0.9999833106994629, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.239442612037237e-05, 'epoch': 0.46}
+
+ 48%|████▊     | 496/1024 [21:31:57<24:16:34, 165.52s/it][AINFO 12-01 17:03:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:03:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:03:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:03:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▊     | 497/1024 [21:34:21<23:17:15, 159.08s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013405976351350546, 'learning_rate': 1e-05, 'num_tokens': 426069918.0, 'completions/mean_length': 6270.6484375, 'completions/min_length': 1135.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6191.015625, 'completions/min_terminated_length': 1135.0, 'completions/max_terminated_length': 14800.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3395765423774719, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.0177823007106781, 'sampling/sampling_logp_difference/max': 2.2334771156311035, 'sampling/importance_sampling_ratio/min': 0.10715518891811371, 'sampling/importance_sampling_ratio/mean': 0.9999721050262451, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6949055104050785e-05, 'epoch': 0.46}
+
+ 49%|████▊     | 497/1024 [21:34:21<23:17:15, 159.08s/it][AINFO 12-01 17:05:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:05:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:05:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:05:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▊     | 498/1024 [21:37:04<23:25:22, 160.31s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0025121544022113085, 'learning_rate': 1e-05, 'num_tokens': 426960077.0, 'completions/mean_length': 6814.2421875, 'completions/min_length': 505.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6505.5400390625, 'completions/min_terminated_length': 505.0, 'completions/max_terminated_length': 15909.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.30221715569496155, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01963714323937893, 'sampling/sampling_logp_difference/max': 2.9061880111694336, 'sampling/importance_sampling_ratio/min': 0.05468378961086273, 'sampling/importance_sampling_ratio/mean': 1.0000885725021362, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.471796430356335e-05, 'epoch': 0.46}
+
+ 49%|████▊     | 498/1024 [21:37:04<23:25:22, 160.31s/it][AINFO 12-01 17:08:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:08:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:08:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:08:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 49%|████▊     | 499/1024 [21:39:40<23:10:11, 158.88s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013243460562080145, 'learning_rate': 1e-05, 'num_tokens': 427985002.0, 'completions/mean_length': 7860.8515625, 'completions/min_length': 1555.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7514.3818359375, 'completions/min_terminated_length': 1555.0, 'completions/max_terminated_length': 14700.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01987369731068611, 'sampling/sampling_logp_difference/max': 3.026093006134033, 'sampling/importance_sampling_ratio/min': 0.0485047772526741, 'sampling/importance_sampling_ratio/mean': 1.0000321865081787, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.989253900726908e-05, 'epoch': 0.46}
+
+ 49%|████▊     | 499/1024 [21:39:40<23:10:11, 158.88s/it][AINFO 12-01 17:10:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:10:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:10:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:10:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▉     | 500/1024 [21:42:45<24:16:27, 166.77s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002385817002505064, 'learning_rate': 1e-05, 'num_tokens': 429015898.0, 'completions/mean_length': 7911.0, 'completions/min_length': 786.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7420.826171875, 'completions/min_terminated_length': 786.0, 'completions/max_terminated_length': 16033.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02090851217508316, 'sampling/sampling_logp_difference/max': 2.042759418487549, 'sampling/importance_sampling_ratio/min': 0.13760042190551758, 'sampling/importance_sampling_ratio/mean': 1.000044822692871, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.90786623636086e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 500/1024 [21:42:45<24:16:27, 166.77s/it][AINFO 12-01 17:14:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:14:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:14:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:14:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▉     | 501/1024 [21:45:23<23:52:35, 164.35s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0025584690738469362, 'learning_rate': 1e-05, 'num_tokens': 429878190.0, 'completions/mean_length': 6578.65625, 'completions/min_length': 646.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6343.328125, 'completions/min_terminated_length': 646.0, 'completions/max_terminated_length': 14235.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.26826781034469604, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020747404545545578, 'sampling/sampling_logp_difference/max': 6.593681812286377, 'sampling/importance_sampling_ratio/min': 0.0013689902843907475, 'sampling/importance_sampling_ratio/mean': 0.9999889135360718, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.773775089641276e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 501/1024 [21:45:23<23:52:35, 164.35s/it][AINFO 12-01 17:16:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:16:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:16:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:16:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▉     | 502/1024 [21:48:38<25:08:37, 173.41s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015530067030340433, 'learning_rate': 1e-05, 'num_tokens': 430887005.0, 'completions/mean_length': 7718.4296875, 'completions/min_length': 1402.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7140.7255859375, 'completions/min_terminated_length': 1402.0, 'completions/max_terminated_length': 15979.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.22331714630126953, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020114637911319733, 'sampling/sampling_logp_difference/max': 2.276700735092163, 'sampling/importance_sampling_ratio/min': 0.10262222588062286, 'sampling/importance_sampling_ratio/mean': 1.0000476837158203, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.763543481407396e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 502/1024 [21:48:38<25:08:37, 173.41s/it][AINFO 12-01 17:19:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:19:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:19:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:19:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▉     | 503/1024 [21:51:47<25:45:37, 178.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015679238131269813, 'learning_rate': 1e-05, 'num_tokens': 431878360.0, 'completions/mean_length': 7596.9609375, 'completions/min_length': 1610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7239.76416015625, 'completions/min_terminated_length': 1610.0, 'completions/max_terminated_length': 16188.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01939081773161888, 'sampling/sampling_logp_difference/max': 4.175537586212158, 'sampling/importance_sampling_ratio/min': 0.015366928651928902, 'sampling/importance_sampling_ratio/mean': 1.0000818967819214, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3728557443500904e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 503/1024 [21:51:47<25:45:37, 178.00s/it][AINFO 12-01 17:23:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:23:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:23:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:23:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▉     | 504/1024 [21:55:02<26:26:50, 183.10s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019014077261090279, 'learning_rate': 1e-05, 'num_tokens': 432923673.0, 'completions/mean_length': 8023.3203125, 'completions/min_length': 1022.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7314.7880859375, 'completions/min_terminated_length': 1022.0, 'completions/max_terminated_length': 16277.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018529443070292473, 'sampling/sampling_logp_difference/max': 2.020641803741455, 'sampling/importance_sampling_ratio/min': 0.13257035613059998, 'sampling/importance_sampling_ratio/mean': 1.0000507831573486, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.091822410780878e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 504/1024 [21:55:02<26:26:50, 183.10s/it][AINFO 12-01 17:26:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:26:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:26:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:26:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▉     | 505/1024 [21:57:48<25:40:34, 178.10s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.005913345608860254, 'learning_rate': 1e-05, 'num_tokens': 433743656.0, 'completions/mean_length': 6243.1171875, 'completions/min_length': 675.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5915.99169921875, 'completions/min_terminated_length': 675.0, 'completions/max_terminated_length': 16267.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2704022228717804, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020300734788179398, 'sampling/sampling_logp_difference/max': 10.362953186035156, 'sampling/importance_sampling_ratio/min': 3.158105391776189e-05, 'sampling/importance_sampling_ratio/mean': 1.000060796737671, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.417603279056493e-05, 'epoch': 0.46}
+
+ 49%|████▉     | 505/1024 [21:57:48<25:40:34, 178.10s/it][AINFO 12-01 17:29:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:29:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:29:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:29:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 49%|████▉     | 506/1024 [22:01:06<26:27:55, 183.93s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001586702885106206, 'learning_rate': 1e-05, 'num_tokens': 434812135.0, 'completions/mean_length': 8184.1171875, 'completions/min_length': 1070.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7563.95849609375, 'completions/min_terminated_length': 1070.0, 'completions/max_terminated_length': 15828.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3227166533470154, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021054696291685104, 'sampling/sampling_logp_difference/max': 2.247175931930542, 'sampling/importance_sampling_ratio/min': 0.10569729655981064, 'sampling/importance_sampling_ratio/mean': 1.0000395774841309, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.254285563045414e-05, 'epoch': 0.47}
+
+ 49%|████▉     | 506/1024 [22:01:06<26:27:55, 183.93s/it][AINFO 12-01 17:32:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:32:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:32:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:32:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|████▉     | 507/1024 [22:04:22<26:57:44, 187.74s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018974852282553911, 'learning_rate': 1e-05, 'num_tokens': 435697456.0, 'completions/mean_length': 6760.8203125, 'completions/min_length': 970.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6204.107421875, 'completions/min_terminated_length': 970.0, 'completions/max_terminated_length': 16208.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.29955869913101196, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019445277750492096, 'sampling/sampling_logp_difference/max': 1.5147712230682373, 'sampling/importance_sampling_ratio/min': 0.21985848248004913, 'sampling/importance_sampling_ratio/mean': 0.9999649524688721, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.402056763159635e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 507/1024 [22:04:22<26:57:44, 187.74s/it][AINFO 12-01 17:35:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:35:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:35:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:35:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|████▉     | 508/1024 [22:07:00<25:36:12, 178.63s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0011319608893245459, 'learning_rate': 1e-05, 'num_tokens': 436661427.0, 'completions/mean_length': 7358.5234375, 'completions/min_length': 717.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7287.45654296875, 'completions/min_terminated_length': 717.0, 'completions/max_terminated_length': 15947.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.23304283618927002, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022408289834856987, 'sampling/sampling_logp_difference/max': 5.24223518371582, 'sampling/importance_sampling_ratio/min': 0.005288423039019108, 'sampling/importance_sampling_ratio/mean': 0.9999656677246094, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.448265985956823e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 508/1024 [22:07:00<25:36:12, 178.63s/it][AINFO 12-01 17:38:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:38:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:38:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:38:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|████▉     | 509/1024 [22:09:47<25:04:52, 175.33s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002320842118933797, 'learning_rate': 1e-05, 'num_tokens': 437588987.0, 'completions/mean_length': 7125.6875, 'completions/min_length': 986.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7052.78759765625, 'completions/min_terminated_length': 986.0, 'completions/max_terminated_length': 15782.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.34876546263694763, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02005033940076828, 'sampling/sampling_logp_difference/max': 6.680258274078369, 'sampling/importance_sampling_ratio/min': 0.0012554536806419492, 'sampling/importance_sampling_ratio/mean': 0.9999493956565857, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.4395032975662616e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 509/1024 [22:09:47<25:04:52, 175.33s/it][AINFO 12-01 17:41:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:41:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:41:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:41:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|████▉     | 510/1024 [22:12:50<25:21:17, 177.58s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0015851956559345126, 'learning_rate': 1e-05, 'num_tokens': 438696613.0, 'completions/mean_length': 8502.203125, 'completions/min_length': 1061.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8181.8046875, 'completions/min_terminated_length': 1061.0, 'completions/max_terminated_length': 15864.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.24435339868068695, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020654216408729553, 'sampling/sampling_logp_difference/max': 11.790275573730469, 'sampling/importance_sampling_ratio/min': 7.577891210530652e-06, 'sampling/importance_sampling_ratio/mean': 0.9999713897705078, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2160486575303366e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 510/1024 [22:12:50<25:21:17, 177.58s/it][AINFO 12-01 17:44:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:44:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:44:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:44:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 50%|████▉     | 511/1024 [22:15:35<24:44:37, 173.64s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010662841377779841, 'learning_rate': 1e-05, 'num_tokens': 439640934.0, 'completions/mean_length': 7232.6953125, 'completions/min_length': 1010.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6937.49169921875, 'completions/min_terminated_length': 1010.0, 'completions/max_terminated_length': 15887.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2738093435764313, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020428404211997986, 'sampling/sampling_logp_difference/max': 7.68292760848999, 'sampling/importance_sampling_ratio/min': 0.0004606244037859142, 'sampling/importance_sampling_ratio/mean': 1.0000033378601074, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.653045445162206e-05, 'epoch': 0.47}
+
+ 50%|████▉     | 511/1024 [22:15:35<24:44:37, 173.64s/it][AINFO 12-01 17:46:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:46:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:46:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:46:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 512/1024 [22:18:37<25:03:16, 176.17s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0030004396103322506, 'learning_rate': 1e-05, 'num_tokens': 440524442.0, 'completions/mean_length': 6758.15625, 'completions/min_length': 719.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6447.64501953125, 'completions/min_terminated_length': 719.0, 'completions/max_terminated_length': 16320.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.24489018321037292, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018582195043563843, 'sampling/sampling_logp_difference/max': 1.9700236320495605, 'sampling/importance_sampling_ratio/min': 0.13945356011390686, 'sampling/importance_sampling_ratio/mean': 0.9999950528144836, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3073954177307314e-05, 'epoch': 0.47}
+
+ 50%|█████     | 512/1024 [22:18:37<25:03:16, 176.17s/it][AINFO 12-01 17:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:49:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:49:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 513/1024 [22:21:32<24:59:26, 176.06s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.001875787740573287, 'learning_rate': 1e-05, 'num_tokens': 441476032.0, 'completions/mean_length': 7283.109375, 'completions/min_length': 1069.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6427.47021484375, 'completions/min_terminated_length': 1069.0, 'completions/max_terminated_length': 15737.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2675113379955292, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01935657486319542, 'sampling/sampling_logp_difference/max': 4.438741683959961, 'sampling/importance_sampling_ratio/min': 0.011810790747404099, 'sampling/importance_sampling_ratio/mean': 0.9998928308486938, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.597766678671178e-05, 'epoch': 0.47}
+
+ 50%|█████     | 513/1024 [22:21:32<24:59:26, 176.06s/it][AINFO 12-01 17:52:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:52:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:52:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:52:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 514/1024 [22:24:12<24:13:43, 171.03s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002408356172963977, 'learning_rate': 1e-05, 'num_tokens': 442336173.0, 'completions/mean_length': 6587.1640625, 'completions/min_length': 687.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6510.0234375, 'completions/min_terminated_length': 687.0, 'completions/max_terminated_length': 15725.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021009862422943115, 'sampling/sampling_logp_difference/max': 12.08456039428711, 'sampling/importance_sampling_ratio/min': 5.646016234095441e-06, 'sampling/importance_sampling_ratio/mean': 1.0000665187835693, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.18821296079841e-05, 'epoch': 0.47}
+
+ 50%|█████     | 514/1024 [22:24:12<24:13:43, 171.03s/it][AINFO 12-01 17:55:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:55:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:55:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:55:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 515/1024 [22:27:00<24:03:09, 170.12s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0023354636505246162, 'learning_rate': 1e-05, 'num_tokens': 443262471.0, 'completions/mean_length': 7068.140625, 'completions/min_length': 1388.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6994.78759765625, 'completions/min_terminated_length': 1388.0, 'completions/max_terminated_length': 14534.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020581800490617752, 'sampling/sampling_logp_difference/max': 2.0917153358459473, 'sampling/importance_sampling_ratio/min': 0.12347515672445297, 'sampling/importance_sampling_ratio/mean': 1.000001311302185, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.195709650251956e-05, 'epoch': 0.47}
+
+ 50%|█████     | 515/1024 [22:27:00<24:03:09, 170.12s/it][AINFO 12-01 17:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:58:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 17:58:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 516/1024 [22:29:42<23:39:47, 167.69s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002407216699793935, 'learning_rate': 1e-05, 'num_tokens': 444255069.0, 'completions/mean_length': 7611.984375, 'completions/min_length': 992.0, 'completions/max_length': 15396.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7611.984375, 'completions/min_terminated_length': 992.0, 'completions/max_terminated_length': 15396.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3543020486831665, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01980050653219223, 'sampling/sampling_logp_difference/max': 5.5009002685546875, 'sampling/importance_sampling_ratio/min': 0.00408309418708086, 'sampling/importance_sampling_ratio/mean': 0.9999485015869141, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.8752083305080305e-05, 'epoch': 0.47}
+
+ 50%|█████     | 516/1024 [22:29:42<23:39:47, 167.69s/it][AINFO 12-01 18:00:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:00:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:00:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:00:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 50%|█████     | 517/1024 [22:32:17<23:04:09, 163.80s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002516290871426463, 'learning_rate': 1e-05, 'num_tokens': 445108368.0, 'completions/mean_length': 6484.4609375, 'completions/min_length': 433.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6165.12060546875, 'completions/min_terminated_length': 433.0, 'completions/max_terminated_length': 15627.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.28353503346443176, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020371951162815094, 'sampling/sampling_logp_difference/max': 2.2164852619171143, 'sampling/importance_sampling_ratio/min': 0.10899151861667633, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.336508257234527e-05, 'epoch': 0.48}
+
+ 50%|█████     | 517/1024 [22:32:17<23:04:09, 163.80s/it][AINFO 12-01 18:03:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:03:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:03:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:03:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████     | 518/1024 [22:34:41<22:13:39, 158.14s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0034705176949501038, 'learning_rate': 1e-05, 'num_tokens': 445911705.0, 'completions/mean_length': 6135.6328125, 'completions/min_length': 1389.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6054.93701171875, 'completions/min_terminated_length': 1389.0, 'completions/max_terminated_length': 15756.0, 'rewards/accuracy_reward/mean': 0.6015625, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.6015625, 'reward_std': 0.2988022267818451, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01926891878247261, 'sampling/sampling_logp_difference/max': 1.238471508026123, 'sampling/importance_sampling_ratio/min': 0.29158928990364075, 'sampling/importance_sampling_ratio/mean': 1.0000296831130981, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.325073768995935e-05, 'epoch': 0.48}
+
+ 51%|█████     | 518/1024 [22:34:41<22:13:39, 158.14s/it][AINFO 12-01 18:05:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:05:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:05:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:05:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████     | 519/1024 [22:37:28<22:32:12, 160.66s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0018766947323456407, 'learning_rate': 1e-05, 'num_tokens': 446940505.0, 'completions/mean_length': 7886.0, 'completions/min_length': 624.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7611.87060546875, 'completions/min_terminated_length': 624.0, 'completions/max_terminated_length': 15573.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0201311893761158, 'sampling/sampling_logp_difference/max': 4.1327009201049805, 'sampling/importance_sampling_ratio/min': 0.016039498150348663, 'sampling/importance_sampling_ratio/mean': 1.0000088214874268, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.198384744493524e-05, 'epoch': 0.48}
+
+ 51%|█████     | 519/1024 [22:37:28<22:32:12, 160.66s/it][AINFO 12-01 18:08:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:08:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:08:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:08:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████     | 520/1024 [22:40:22<23:02:49, 164.62s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020135007798671722, 'learning_rate': 1e-05, 'num_tokens': 447922390.0, 'completions/mean_length': 7511.2890625, 'completions/min_length': 972.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7074.92578125, 'completions/min_terminated_length': 972.0, 'completions/max_terminated_length': 16078.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.27145031094551086, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02050921693444252, 'sampling/sampling_logp_difference/max': 1.9890000820159912, 'sampling/importance_sampling_ratio/min': 0.13683216273784637, 'sampling/importance_sampling_ratio/mean': 0.999992847442627, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.165645436340128e-05, 'epoch': 0.48}
+
+ 51%|█████     | 520/1024 [22:40:22<23:02:49, 164.62s/it][AINFO 12-01 18:11:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:11:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:11:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:11:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████     | 521/1024 [22:43:24<23:43:50, 169.84s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019559739157557487, 'learning_rate': 1e-05, 'num_tokens': 448848046.0, 'completions/mean_length': 7088.1875, 'completions/min_length': 630.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6940.63525390625, 'completions/min_terminated_length': 630.0, 'completions/max_terminated_length': 16038.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.17859892547130585, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018242474645376205, 'sampling/sampling_logp_difference/max': 8.647464752197266, 'sampling/importance_sampling_ratio/min': 0.00017557140381541103, 'sampling/importance_sampling_ratio/mean': 1.0000606775283813, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.751451393123716e-05, 'epoch': 0.48}
+
+ 51%|█████     | 521/1024 [22:43:24<23:43:50, 169.84s/it][AINFO 12-01 18:14:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:14:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:14:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:14:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 522/1024 [22:46:20<23:57:12, 171.78s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0038597413804382086, 'learning_rate': 1e-05, 'num_tokens': 449810621.0, 'completions/mean_length': 7376.4296875, 'completions/min_length': 631.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6933.43408203125, 'completions/min_terminated_length': 631.0, 'completions/max_terminated_length': 16069.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2754020392894745, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019085779786109924, 'sampling/sampling_logp_difference/max': 2.1275720596313477, 'sampling/importance_sampling_ratio/min': 0.11912617087364197, 'sampling/importance_sampling_ratio/mean': 1.0000689029693604, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.168338741943444e-05, 'epoch': 0.48}
+
+ 51%|█████     | 522/1024 [22:46:20<23:57:12, 171.78s/it][AINFO 12-01 18:17:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:17:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:17:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:17:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 51%|█████     | 523/1024 [22:48:57<23:16:34, 167.25s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016637073131278157, 'learning_rate': 1e-05, 'num_tokens': 450678051.0, 'completions/mean_length': 6633.109375, 'completions/min_length': 861.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6556.33056640625, 'completions/min_terminated_length': 861.0, 'completions/max_terminated_length': 15757.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.20805485546588898, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019459716975688934, 'sampling/sampling_logp_difference/max': 1.6547937393188477, 'sampling/importance_sampling_ratio/min': 0.19113145768642426, 'sampling/importance_sampling_ratio/mean': 0.9999922513961792, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.680475356304669e-05, 'epoch': 0.48}
+
+ 51%|█████     | 523/1024 [22:48:57<23:16:34, 167.25s/it][AINFO 12-01 18:20:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:20:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:20:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:20:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████     | 524/1024 [22:51:27<22:31:19, 162.16s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013125581899657845, 'learning_rate': 1e-05, 'num_tokens': 451494598.0, 'completions/mean_length': 6244.8984375, 'completions/min_length': 790.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6001.56005859375, 'completions/min_terminated_length': 790.0, 'completions/max_terminated_length': 16200.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01995152235031128, 'sampling/sampling_logp_difference/max': 5.125833034515381, 'sampling/importance_sampling_ratio/min': 0.005941266193985939, 'sampling/importance_sampling_ratio/mean': 1.0000550746917725, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.361373546795221e-05, 'epoch': 0.48}
+
+ 51%|█████     | 524/1024 [22:51:27<22:31:19, 162.16s/it][AINFO 12-01 18:22:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:22:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:22:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:22:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████▏    | 525/1024 [22:54:25<23:07:37, 166.85s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003693200647830963, 'learning_rate': 1e-05, 'num_tokens': 452506603.0, 'completions/mean_length': 7762.8515625, 'completions/min_length': 1521.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7694.96826171875, 'completions/min_terminated_length': 1521.0, 'completions/max_terminated_length': 16090.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.2120065689086914, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0207878015935421, 'sampling/sampling_logp_difference/max': 2.4040207862854004, 'sampling/importance_sampling_ratio/min': 0.09035392850637436, 'sampling/importance_sampling_ratio/mean': 1.0001039505004883, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.205934342531691e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 525/1024 [22:54:25<23:07:37, 166.85s/it][AINFO 12-01 18:25:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:25:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:25:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:25:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████▏    | 526/1024 [22:57:20<23:26:25, 169.45s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019487823592498899, 'learning_rate': 1e-05, 'num_tokens': 453603789.0, 'completions/mean_length': 8403.078125, 'completions/min_length': 1723.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8145.62890625, 'completions/min_terminated_length': 1723.0, 'completions/max_terminated_length': 15991.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.401616632938385, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.021674305200576782, 'sampling/sampling_logp_difference/max': 2.989922523498535, 'sampling/importance_sampling_ratio/min': 0.050291333347558975, 'sampling/importance_sampling_ratio/mean': 1.0000473260879517, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.384051557186467e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 526/1024 [22:57:20<23:26:25, 169.45s/it][AINFO 12-01 18:28:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:28:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:28:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:28:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 51%|█████▏    | 527/1024 [23:00:08<23:19:44, 168.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001249633845873177, 'learning_rate': 1e-05, 'num_tokens': 454623629.0, 'completions/mean_length': 7792.3125, 'completions/min_length': 1391.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7586.1123046875, 'completions/min_terminated_length': 1391.0, 'completions/max_terminated_length': 16073.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.25620076060295105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021152272820472717, 'sampling/sampling_logp_difference/max': 1.82162606716156, 'sampling/importance_sampling_ratio/min': 0.16176250576972961, 'sampling/importance_sampling_ratio/mean': 0.9999831914901733, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.840433686898905e-05, 'epoch': 0.48}
+
+ 51%|█████▏    | 527/1024 [23:00:08<23:19:44, 168.98s/it][AINFO 12-01 18:31:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:31:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:31:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:31:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 52%|█████▏    | 528/1024 [23:02:48<22:54:29, 166.27s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011957770911976695, 'learning_rate': 1e-05, 'num_tokens': 455619869.0, 'completions/mean_length': 7622.125, 'completions/min_length': 2127.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7339.48388671875, 'completions/min_terminated_length': 2127.0, 'completions/max_terminated_length': 15840.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.1990984082221985, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01841442473232746, 'sampling/sampling_logp_difference/max': 3.0315840244293213, 'sampling/importance_sampling_ratio/min': 0.04823916405439377, 'sampling/importance_sampling_ratio/mean': 0.999976396560669, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.292655464472773e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 528/1024 [23:02:48<22:54:29, 166.27s/it][AINFO 12-01 18:34:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:34:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:34:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:34:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 529/1024 [23:05:33<22:48:05, 165.83s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018044528551399708, 'learning_rate': 1e-05, 'num_tokens': 456645102.0, 'completions/mean_length': 7872.0703125, 'completions/min_length': 1412.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7736.9609375, 'completions/min_terminated_length': 1412.0, 'completions/max_terminated_length': 15951.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.29955869913101196, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019718319177627563, 'sampling/sampling_logp_difference/max': 1.5711593627929688, 'sampling/importance_sampling_ratio/min': 0.2078041285276413, 'sampling/importance_sampling_ratio/mean': 0.99998939037323, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.930985053557379e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 529/1024 [23:05:33<22:48:05, 165.83s/it][AINFO 12-01 18:36:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:36:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:36:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:36:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 530/1024 [23:08:31<23:16:14, 169.58s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0017170588253065944, 'learning_rate': 1e-05, 'num_tokens': 457793703.0, 'completions/mean_length': 8830.8828125, 'completions/min_length': 2325.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8587.2333984375, 'completions/min_terminated_length': 2325.0, 'completions/max_terminated_length': 15837.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.24329258501529694, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021323256194591522, 'sampling/sampling_logp_difference/max': 4.755186557769775, 'sampling/importance_sampling_ratio/min': 0.008606938645243645, 'sampling/importance_sampling_ratio/mean': 0.9999715089797974, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.566113241504354e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 530/1024 [23:08:31<23:16:14, 169.58s/it][AINFO 12-01 18:39:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:39:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:39:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:39:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 52%|█████▏    | 531/1024 [23:11:43<24:07:05, 176.12s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018092576647177339, 'learning_rate': 1e-05, 'num_tokens': 458798754.0, 'completions/mean_length': 7689.0859375, 'completions/min_length': 915.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7551.07177734375, 'completions/min_terminated_length': 915.0, 'completions/max_terminated_length': 15897.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3066929280757904, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02048235759139061, 'sampling/sampling_logp_difference/max': 4.885918617248535, 'sampling/importance_sampling_ratio/min': 0.007552183233201504, 'sampling/importance_sampling_ratio/mean': 0.9999454021453857, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6522737022769434e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 531/1024 [23:11:43<24:07:05, 176.12s/it][AINFO 12-01 18:42:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:42:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:42:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:42:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 52%|█████▏    | 532/1024 [23:14:48<24:27:45, 179.00s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0021977140568196774, 'learning_rate': 1e-05, 'num_tokens': 459902313.0, 'completions/mean_length': 8442.5546875, 'completions/min_length': 620.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8119.7314453125, 'completions/min_terminated_length': 620.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021879252046346664, 'sampling/sampling_logp_difference/max': 1.6216278076171875, 'sampling/importance_sampling_ratio/min': 0.19757682085037231, 'sampling/importance_sampling_ratio/mean': 0.9999510049819946, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.446667255957436e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 532/1024 [23:14:48<24:27:45, 179.00s/it][AINFO 12-01 18:46:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:46:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:46:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:46:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 52%|█████▏    | 533/1024 [23:17:47<24:22:43, 178.74s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001114827347919345, 'learning_rate': 1e-05, 'num_tokens': 460901291.0, 'completions/mean_length': 7672.703125, 'completions/min_length': 1106.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6853.69287109375, 'completions/min_terminated_length': 1106.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.1462520956993103, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021045228466391563, 'sampling/sampling_logp_difference/max': 1.794968843460083, 'sampling/importance_sampling_ratio/min': 0.1661326289176941, 'sampling/importance_sampling_ratio/mean': 1.000037670135498, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.132354200057307e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 533/1024 [23:17:47<24:22:43, 178.74s/it][AINFO 12-01 18:49:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:49:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:49:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:49:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 52%|█████▏    | 534/1024 [23:20:40<24:06:38, 177.14s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018293894827365875, 'learning_rate': 1e-05, 'num_tokens': 461944718.0, 'completions/mean_length': 7998.5859375, 'completions/min_length': 1027.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7865.484375, 'completions/min_terminated_length': 1027.0, 'completions/max_terminated_length': 16040.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.24435338377952576, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02074873074889183, 'sampling/sampling_logp_difference/max': 12.602254867553711, 'sampling/importance_sampling_ratio/min': 3.3644203085714253e-06, 'sampling/importance_sampling_ratio/mean': 0.9999821186065674, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.386251777077632e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 534/1024 [23:20:40<24:06:38, 177.14s/it][AINFO 12-01 18:51:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:51:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:51:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:51:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 52%|█████▏    | 535/1024 [23:23:26<23:37:15, 173.90s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015607211971655488, 'learning_rate': 1e-05, 'num_tokens': 462913042.0, 'completions/mean_length': 7394.84375, 'completions/min_length': 1239.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7324.06298828125, 'completions/min_terminated_length': 1239.0, 'completions/max_terminated_length': 16302.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.29644322395324707, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021394766867160797, 'sampling/sampling_logp_difference/max': 2.8006763458251953, 'sampling/importance_sampling_ratio/min': 0.06076895073056221, 'sampling/importance_sampling_ratio/mean': 1.0000122785568237, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.761474267274025e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 535/1024 [23:23:26<23:37:15, 173.90s/it][AINFO 12-01 18:54:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:54:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:54:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:54:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 52%|█████▏    | 536/1024 [23:26:09<23:05:41, 170.37s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0029957920778542757, 'learning_rate': 1e-05, 'num_tokens': 463855758.0, 'completions/mean_length': 7204.78125, 'completions/min_length': 945.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6831.64208984375, 'completions/min_terminated_length': 945.0, 'completions/max_terminated_length': 16080.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29378965497016907, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021175671368837357, 'sampling/sampling_logp_difference/max': 6.429126262664795, 'sampling/importance_sampling_ratio/min': 0.0016138603677973151, 'sampling/importance_sampling_ratio/mean': 1.0000700950622559, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5069752104609506e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 536/1024 [23:26:09<23:05:41, 170.37s/it][AINFO 12-01 18:57:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:57:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:57:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 18:57:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 52%|█████▏    | 537/1024 [23:28:54<22:51:55, 169.03s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001456334488466382, 'learning_rate': 1e-05, 'num_tokens': 464769355.0, 'completions/mean_length': 6993.4140625, 'completions/min_length': 864.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6531.58154296875, 'completions/min_terminated_length': 864.0, 'completions/max_terminated_length': 16243.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.17859892547130585, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021541595458984375, 'sampling/sampling_logp_difference/max': 3.33123779296875, 'sampling/importance_sampling_ratio/min': 0.03574882820248604, 'sampling/importance_sampling_ratio/mean': 1.0000298023223877, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.749465099645022e-05, 'epoch': 0.49}
+
+ 52%|█████▏    | 537/1024 [23:28:54<22:51:55, 169.03s/it][AINFO 12-01 19:00:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:00:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:00:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:00:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 538/1024 [23:31:41<22:43:10, 168.29s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0025498478207737207, 'learning_rate': 1e-05, 'num_tokens': 465757284.0, 'completions/mean_length': 7547.3203125, 'completions/min_length': 1196.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7262.26611328125, 'completions/min_terminated_length': 1196.0, 'completions/max_terminated_length': 16196.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.31587693095207214, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021136075258255005, 'sampling/sampling_logp_difference/max': 3.021927833557129, 'sampling/importance_sampling_ratio/min': 0.048707228153944016, 'sampling/importance_sampling_ratio/mean': 1.0000312328338623, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.786437839105929e-05, 'epoch': 0.49}
+
+ 53%|█████▎    | 538/1024 [23:31:41<22:43:10, 168.29s/it][AINFO 12-01 19:02:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:02:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:02:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:02:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 539/1024 [23:34:11<21:55:09, 162.70s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.00446446193382144, 'learning_rate': 1e-05, 'num_tokens': 466628584.0, 'completions/mean_length': 6656.53125, 'completions/min_length': 700.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6579.93701171875, 'completions/min_terminated_length': 700.0, 'completions/max_terminated_length': 14856.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.23645778000354767, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020300280302762985, 'sampling/sampling_logp_difference/max': 4.564592361450195, 'sampling/importance_sampling_ratio/min': 0.01041412353515625, 'sampling/importance_sampling_ratio/mean': 0.9999902844429016, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.575560524244793e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 539/1024 [23:34:11<21:55:09, 162.70s/it][AINFO 12-01 19:05:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:05:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:05:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:05:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 53%|█████▎    | 540/1024 [23:37:02<22:12:17, 165.16s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0013115634210407734, 'learning_rate': 1e-05, 'num_tokens': 467579688.0, 'completions/mean_length': 7268.8125, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7050.04833984375, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2993262708187103, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02100731059908867, 'sampling/sampling_logp_difference/max': 4.713386535644531, 'sampling/importance_sampling_ratio/min': 0.008974334225058556, 'sampling/importance_sampling_ratio/mean': 1.0000804662704468, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.7152485169353895e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 540/1024 [23:37:02<22:12:17, 165.16s/it][AINFO 12-01 19:08:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:08:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:08:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:08:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 53%|█████▎    | 541/1024 [23:39:50<22:18:15, 166.24s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012581993360072374, 'learning_rate': 1e-05, 'num_tokens': 468623332.0, 'completions/mean_length': 8002.40625, 'completions/min_length': 1304.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7869.36572265625, 'completions/min_terminated_length': 1304.0, 'completions/max_terminated_length': 15754.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.28277361392974854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019561301916837692, 'sampling/sampling_logp_difference/max': 1.6438701152801514, 'sampling/importance_sampling_ratio/min': 0.1932307630777359, 'sampling/importance_sampling_ratio/mean': 1.0000278949737549, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2618698646256234e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 541/1024 [23:39:50<22:18:15, 166.24s/it][AINFO 12-01 19:11:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:11:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:11:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:11:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 53%|█████▎    | 542/1024 [23:43:06<23:26:29, 175.08s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0013315437827259302, 'learning_rate': 1e-05, 'num_tokens': 469637800.0, 'completions/mean_length': 7772.34375, 'completions/min_length': 799.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7565.66455078125, 'completions/min_terminated_length': 799.0, 'completions/max_terminated_length': 16152.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.28247907757759094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01942390948534012, 'sampling/sampling_logp_difference/max': 2.4601821899414062, 'sampling/importance_sampling_ratio/min': 0.08541938662528992, 'sampling/importance_sampling_ratio/mean': 1.0000251531600952, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.0213681359091424e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 542/1024 [23:43:06<23:26:29, 175.08s/it][AINFO 12-01 19:14:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:14:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:14:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:14:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 53%|█████▎    | 543/1024 [23:46:09<23:41:56, 177.37s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022080529015511274, 'learning_rate': 1e-05, 'num_tokens': 470671893.0, 'completions/mean_length': 7894.2265625, 'completions/min_length': 1457.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7549.11376953125, 'completions/min_terminated_length': 1457.0, 'completions/max_terminated_length': 16356.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.26698729395866394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02032581716775894, 'sampling/sampling_logp_difference/max': 2.1142935752868652, 'sampling/importance_sampling_ratio/min': 0.12071853876113892, 'sampling/importance_sampling_ratio/mean': 0.9999662637710571, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.417111575909075e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 543/1024 [23:46:09<23:41:56, 177.37s/it][AINFO 12-01 19:17:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:17:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:17:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:17:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 53%|█████▎    | 544/1024 [23:48:44<22:46:43, 170.84s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012946520000696182, 'learning_rate': 1e-05, 'num_tokens': 471611354.0, 'completions/mean_length': 7195.1015625, 'completions/min_length': 1720.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7049.24658203125, 'completions/min_terminated_length': 1720.0, 'completions/max_terminated_length': 15725.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.22225633263587952, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021039776504039764, 'sampling/sampling_logp_difference/max': 16.12366485595703, 'sampling/importance_sampling_ratio/min': 9.944462675548493e-08, 'sampling/importance_sampling_ratio/mean': 0.9999868869781494, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.005058485745394e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 544/1024 [23:48:44<22:46:43, 170.84s/it][AINFO 12-01 19:20:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:20:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:20:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:20:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 53%|█████▎    | 545/1024 [23:51:52<23:24:58, 175.99s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019653867930173874, 'learning_rate': 1e-05, 'num_tokens': 472681073.0, 'completions/mean_length': 8144.0546875, 'completions/min_length': 643.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7946.29638671875, 'completions/min_terminated_length': 643.0, 'completions/max_terminated_length': 15928.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.23486506938934326, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02194231003522873, 'sampling/sampling_logp_difference/max': 2.795642852783203, 'sampling/importance_sampling_ratio/min': 0.0610755980014801, 'sampling/importance_sampling_ratio/mean': 0.9998981952667236, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.453159815260733e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 545/1024 [23:51:52<23:24:58, 175.99s/it][AINFO 12-01 19:23:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:23:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:23:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:23:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 53%|█████▎    | 546/1024 [23:54:34<22:48:36, 171.79s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018283744575455785, 'learning_rate': 1e-05, 'num_tokens': 473595045.0, 'completions/mean_length': 6995.03125, 'completions/min_length': 1098.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6846.00048828125, 'completions/min_terminated_length': 1098.0, 'completions/max_terminated_length': 14893.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.26409149169921875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018408093601465225, 'sampling/sampling_logp_difference/max': 5.31826114654541, 'sampling/importance_sampling_ratio/min': 0.004901268985122442, 'sampling/importance_sampling_ratio/mean': 0.9999638795852661, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.871349597124208e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 546/1024 [23:54:34<22:48:36, 171.79s/it][AINFO 12-01 19:25:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:25:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:25:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:25:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 53%|█████▎    | 547/1024 [23:57:03<21:51:35, 164.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017409659922122955, 'learning_rate': 1e-05, 'num_tokens': 474439180.0, 'completions/mean_length': 6450.2421875, 'completions/min_length': 1006.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6372.0234375, 'completions/min_terminated_length': 1006.0, 'completions/max_terminated_length': 15863.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.25513991713523865, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02010829746723175, 'sampling/sampling_logp_difference/max': 2.6709706783294678, 'sampling/importance_sampling_ratio/min': 0.0691850334405899, 'sampling/importance_sampling_ratio/mean': 0.9999823570251465, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.9286873629862384e-05, 'epoch': 0.5}
+
+ 53%|█████▎    | 547/1024 [23:57:03<21:51:35, 164.98s/it][AINFO 12-01 19:28:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:28:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:28:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:28:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 54%|█████▎    | 548/1024 [23:59:38<21:25:10, 162.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0025896199513226748, 'learning_rate': 1e-05, 'num_tokens': 475418208.0, 'completions/mean_length': 7508.03125, 'completions/min_length': 1108.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7221.70947265625, 'completions/min_terminated_length': 1108.0, 'completions/max_terminated_length': 15907.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.1990984082221985, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01911240443587303, 'sampling/sampling_logp_difference/max': 4.128934860229492, 'sampling/importance_sampling_ratio/min': 0.016100019216537476, 'sampling/importance_sampling_ratio/mean': 1.0000269412994385, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1956918633113673e-05, 'epoch': 0.5}
+
+ 54%|█████▎    | 548/1024 [23:59:38<21:25:10, 162.00s/it][AINFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▎    | 549/1024 [24:02:27<21:38:12, 163.98s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0008814946049824357, 'learning_rate': 1e-05, 'num_tokens': 476450744.0, 'completions/mean_length': 7901.6875, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7767.0478515625, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 15852.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019257351756095886, 'sampling/sampling_logp_difference/max': 5.614462852478027, 'sampling/importance_sampling_ratio/min': 0.0036447669845074415, 'sampling/importance_sampling_ratio/mean': 1.0000337362289429, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.484168448470882e-05, 'epoch': 0.51}
+
+ 54%|█████▎    | 549/1024 [24:02:27<21:38:12, 163.98s/it][AINFO 12-01 19:33:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:33:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:33:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:33:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 54%|█████▎    | 550/1024 [24:05:11<21:34:20, 163.84s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0022036773152649403, 'learning_rate': 1e-05, 'num_tokens': 477331248.0, 'completions/mean_length': 6720.6875, 'completions/min_length': 658.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6245.4423828125, 'completions/min_terminated_length': 658.0, 'completions/max_terminated_length': 15483.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019462421536445618, 'sampling/sampling_logp_difference/max': 4.82817268371582, 'sampling/importance_sampling_ratio/min': 0.00800112821161747, 'sampling/importance_sampling_ratio/mean': 0.9999768733978271, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.428426852129633e-05, 'epoch': 0.51}
+
+ 54%|█████▎    | 550/1024 [24:05:11<21:34:20, 163.84s/it][AINFO 12-01 19:36:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:36:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:36:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:36:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 54%|█████▍    | 551/1024 [24:08:16<22:22:04, 170.24s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.005742369685322046, 'learning_rate': 1e-05, 'num_tokens': 478389425.0, 'completions/mean_length': 8094.7578125, 'completions/min_length': 753.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7687.08984375, 'completions/min_terminated_length': 753.0, 'completions/max_terminated_length': 16199.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02236468903720379, 'sampling/sampling_logp_difference/max': 7.055948257446289, 'sampling/importance_sampling_ratio/min': 0.0008622647146694362, 'sampling/importance_sampling_ratio/mean': 0.9999721646308899, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.33503871843277e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 551/1024 [24:08:16<22:22:04, 170.24s/it][AINFO 12-01 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:39:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 552/1024 [24:11:12<22:32:43, 171.96s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016262659337371588, 'learning_rate': 1e-05, 'num_tokens': 479417003.0, 'completions/mean_length': 7881.015625, 'completions/min_length': 1222.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7746.0478515625, 'completions/min_terminated_length': 1222.0, 'completions/max_terminated_length': 16114.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02033335343003273, 'sampling/sampling_logp_difference/max': 1.7305359840393066, 'sampling/importance_sampling_ratio/min': 0.1771894097328186, 'sampling/importance_sampling_ratio/mean': 1.0000284910202026, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.877987043983012e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 552/1024 [24:11:12<22:32:43, 171.96s/it][AINFO 12-01 19:42:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:42:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:42:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:42:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 54%|█████▍    | 553/1024 [24:13:59<22:18:23, 170.50s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001894370187073946, 'learning_rate': 1e-05, 'num_tokens': 480481433.0, 'completions/mean_length': 8146.421875, 'completions/min_length': 664.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7741.29443359375, 'completions/min_terminated_length': 664.0, 'completions/max_terminated_length': 16228.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2414703369140625, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020487213507294655, 'sampling/sampling_logp_difference/max': 4.670137882232666, 'sampling/importance_sampling_ratio/min': 0.009370977059006691, 'sampling/importance_sampling_ratio/mean': 1.000077724456787, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.532741647584771e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 553/1024 [24:13:59<22:18:23, 170.50s/it][AINFO 12-01 19:45:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:45:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:45:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:45:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 54%|█████▍    | 554/1024 [24:16:50<22:17:31, 170.75s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017238021828234196, 'learning_rate': 1e-05, 'num_tokens': 481551094.0, 'completions/mean_length': 8173.9140625, 'completions/min_length': 766.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7909.072265625, 'completions/min_terminated_length': 766.0, 'completions/max_terminated_length': 15901.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.3095887303352356, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020995572209358215, 'sampling/sampling_logp_difference/max': 2.604147434234619, 'sampling/importance_sampling_ratio/min': 0.07396616786718369, 'sampling/importance_sampling_ratio/mean': 0.999964714050293, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.736825682513881e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 554/1024 [24:16:50<22:17:31, 170.75s/it][AINFO 12-01 19:48:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:48:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:48:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:48:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 555/1024 [24:19:31<21:51:52, 167.83s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012160962214693427, 'learning_rate': 1e-05, 'num_tokens': 482561901.0, 'completions/mean_length': 7768.8671875, 'completions/min_length': 857.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7632.11962890625, 'completions/min_terminated_length': 857.0, 'completions/max_terminated_length': 16301.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020229395478963852, 'sampling/sampling_logp_difference/max': 1.750631332397461, 'sampling/importance_sampling_ratio/min': 0.17366427183151245, 'sampling/importance_sampling_ratio/mean': 0.9999764561653137, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.83942830619344e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 555/1024 [24:19:31<21:51:52, 167.83s/it][AINFO 12-01 19:50:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:50:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:50:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:50:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 54%|█████▍    | 556/1024 [24:22:09<21:25:38, 164.83s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001921795541420579, 'learning_rate': 1e-05, 'num_tokens': 483479574.0, 'completions/mean_length': 7011.8203125, 'completions/min_length': 1041.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6786.88818359375, 'completions/min_terminated_length': 1041.0, 'completions/max_terminated_length': 16284.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3185402750968933, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019257403910160065, 'sampling/sampling_logp_difference/max': 2.8713438510894775, 'sampling/importance_sampling_ratio/min': 0.05662278085947037, 'sampling/importance_sampling_ratio/mean': 1.0000029802322388, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.051968426210806e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 556/1024 [24:22:09<21:25:38, 164.83s/it][AINFO 12-01 19:53:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:53:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:53:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:53:26 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 557/1024 [24:25:04<21:45:39, 167.75s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0033293175511062145, 'learning_rate': 1e-05, 'num_tokens': 484501818.0, 'completions/mean_length': 7847.34375, 'completions/min_length': 1133.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7642.46435546875, 'completions/min_terminated_length': 1133.0, 'completions/max_terminated_length': 16163.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021052353084087372, 'sampling/sampling_logp_difference/max': 1.8202447891235352, 'sampling/importance_sampling_ratio/min': 0.16715112328529358, 'sampling/importance_sampling_ratio/mean': 1.0000097751617432, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.651085382396559e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 557/1024 [24:25:04<21:45:39, 167.75s/it][AINFO 12-01 19:56:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:56:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:56:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:56:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 54%|█████▍    | 558/1024 [24:28:22<22:53:17, 176.82s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0030512013472616673, 'learning_rate': 1e-05, 'num_tokens': 485585878.0, 'completions/mean_length': 8321.78125, 'completions/min_length': 1053.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7638.54248046875, 'completions/min_terminated_length': 1053.0, 'completions/max_terminated_length': 15909.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2777610719203949, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019362540915608406, 'sampling/sampling_logp_difference/max': 5.791250228881836, 'sampling/importance_sampling_ratio/min': 0.003054161323234439, 'sampling/importance_sampling_ratio/mean': 0.9999992847442627, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.656639862332668e-05, 'epoch': 0.51}
+
+ 54%|█████▍    | 558/1024 [24:28:22<22:53:17, 176.82s/it][AINFO 12-01 19:59:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:59:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:59:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 19:59:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▍    | 559/1024 [24:31:21<22:56:06, 177.56s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016040641348809004, 'learning_rate': 1e-05, 'num_tokens': 486756408.0, 'completions/mean_length': 9000.640625, 'completions/min_length': 1297.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 8236.8447265625, 'completions/min_terminated_length': 1297.0, 'completions/max_terminated_length': 16360.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2924865782260895, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021108610555529594, 'sampling/sampling_logp_difference/max': 5.6532793045043945, 'sampling/importance_sampling_ratio/min': 0.003506000619381666, 'sampling/importance_sampling_ratio/mean': 0.9999677538871765, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.202935492183315e-05, 'epoch': 0.51}
+
+ 55%|█████▍    | 559/1024 [24:31:21<22:56:06, 177.56s/it][AINFO 12-01 20:02:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:02:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:02:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:02:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 55%|█████▍    | 560/1024 [24:34:16<22:47:34, 176.84s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0008708125678822398, 'learning_rate': 1e-05, 'num_tokens': 487692673.0, 'completions/mean_length': 7172.0703125, 'completions/min_length': 385.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6797.6015625, 'completions/min_terminated_length': 385.0, 'completions/max_terminated_length': 16090.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020119396969676018, 'sampling/sampling_logp_difference/max': 7.688281059265137, 'sampling/importance_sampling_ratio/min': 0.00045816507190465927, 'sampling/importance_sampling_ratio/mean': 1.0000100135803223, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.9596292001297115e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 560/1024 [24:34:16<22:47:34, 176.84s/it][AINFO 12-01 20:05:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:05:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:05:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:05:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▍    | 561/1024 [24:37:40<23:47:04, 184.93s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002455002861097455, 'learning_rate': 1e-05, 'num_tokens': 488836089.0, 'completions/mean_length': 8787.8125, 'completions/min_length': 1818.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8479.0244140625, 'completions/min_terminated_length': 1818.0, 'completions/max_terminated_length': 15735.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.22461533546447754, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021100696176290512, 'sampling/sampling_logp_difference/max': 7.083743095397949, 'sampling/importance_sampling_ratio/min': 0.0008386282133869827, 'sampling/importance_sampling_ratio/mean': 1.0000200271606445, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.595222642616136e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 561/1024 [24:37:40<23:47:04, 184.93s/it][AINFO 12-01 20:08:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:08:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:08:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:08:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▍    | 562/1024 [24:40:41<23:34:13, 183.67s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015431091887876391, 'learning_rate': 1e-05, 'num_tokens': 489784790.0, 'completions/mean_length': 7271.4765625, 'completions/min_length': 1288.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7052.7763671875, 'completions/min_terminated_length': 1288.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.20805485546588898, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01891586184501648, 'sampling/sampling_logp_difference/max': 6.014634132385254, 'sampling/importance_sampling_ratio/min': 0.0024427417665719986, 'sampling/importance_sampling_ratio/mean': 1.0000656843185425, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4572081005990185e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 562/1024 [24:40:41<23:34:13, 183.67s/it][AINFO 12-01 20:11:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:11:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:11:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:11:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▍    | 563/1024 [24:43:40<23:21:16, 182.38s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.001758500817231834, 'learning_rate': 1e-05, 'num_tokens': 490779225.0, 'completions/mean_length': 7610.0234375, 'completions/min_length': 1026.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7326.99169921875, 'completions/min_terminated_length': 1026.0, 'completions/max_terminated_length': 15833.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.4581822156906128, 'frac_reward_zero_std': 0.0, 'sampling/sampling_logp_difference/mean': 0.019409511238336563, 'sampling/sampling_logp_difference/max': 5.97169303894043, 'sampling/importance_sampling_ratio/min': 0.0025499206967651844, 'sampling/importance_sampling_ratio/mean': 1.0000450611114502, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.064394387474749e-05, 'epoch': 0.52}
+
+ 55%|█████▍    | 563/1024 [24:43:40<23:21:16, 182.38s/it][AINFO 12-01 20:14:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:14:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:14:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:14:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▌    | 564/1024 [24:46:28<22:46:12, 178.20s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002212122082710266, 'learning_rate': 1e-05, 'num_tokens': 491630755.0, 'completions/mean_length': 6502.828125, 'completions/min_length': 1091.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5844.08349609375, 'completions/min_terminated_length': 1091.0, 'completions/max_terminated_length': 16192.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.19226360321044922, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01961512677371502, 'sampling/sampling_logp_difference/max': 3.275958299636841, 'sampling/importance_sampling_ratio/min': 0.03778064623475075, 'sampling/importance_sampling_ratio/mean': 1.0000507831573486, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1768863891556975e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 564/1024 [24:46:28<22:46:12, 178.20s/it][AINFO 12-01 20:17:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:17:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:17:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:17:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▌    | 565/1024 [24:49:25<22:38:49, 177.62s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.00169914192520082, 'learning_rate': 1e-05, 'num_tokens': 492649942.0, 'completions/mean_length': 7803.8984375, 'completions/min_length': 1323.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7527.12060546875, 'completions/min_terminated_length': 1323.0, 'completions/max_terminated_length': 16215.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.35218530893325806, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021758798509836197, 'sampling/sampling_logp_difference/max': 2.109866142272949, 'sampling/importance_sampling_ratio/min': 0.12125419825315475, 'sampling/importance_sampling_ratio/mean': 1.0000271797180176, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.319998024333472e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 565/1024 [24:49:25<22:38:49, 177.62s/it][AINFO 12-01 20:20:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:20:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:20:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:20:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▌    | 566/1024 [24:52:22<22:35:41, 177.60s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015316123608499765, 'learning_rate': 1e-05, 'num_tokens': 493771255.0, 'completions/mean_length': 8596.2578125, 'completions/min_length': 862.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8213.25390625, 'completions/min_terminated_length': 862.0, 'completions/max_terminated_length': 15732.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.17912298440933228, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02033020183444023, 'sampling/sampling_logp_difference/max': 2.402879238128662, 'sampling/importance_sampling_ratio/min': 0.09045713394880295, 'sampling/importance_sampling_ratio/mean': 1.0000083446502686, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0221210465697368e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 566/1024 [24:52:22<22:35:41, 177.60s/it][AINFO 12-01 20:23:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:23:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:23:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:23:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▌    | 567/1024 [24:54:59<21:44:09, 171.22s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001833951915614307, 'learning_rate': 1e-05, 'num_tokens': 494715257.0, 'completions/mean_length': 7223.265625, 'completions/min_length': 684.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6850.8779296875, 'completions/min_terminated_length': 684.0, 'completions/max_terminated_length': 16129.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.12756997346878052, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.01968134008347988, 'sampling/sampling_logp_difference/max': 2.2761504650115967, 'sampling/importance_sampling_ratio/min': 0.10267870873212814, 'sampling/importance_sampling_ratio/mean': 1.0000797510147095, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.6466386227875773e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 567/1024 [24:54:59<21:44:09, 171.22s/it][AINFO 12-01 20:26:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:26:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:26:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:26:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 55%|█████▌    | 568/1024 [24:57:45<21:31:02, 169.87s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011065925937145948, 'learning_rate': 1e-05, 'num_tokens': 495709584.0, 'completions/mean_length': 7619.4921875, 'completions/min_length': 725.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7336.76611328125, 'completions/min_terminated_length': 725.0, 'completions/max_terminated_length': 16077.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.25513991713523865, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021267499774694443, 'sampling/sampling_logp_difference/max': 6.56168794631958, 'sampling/importance_sampling_ratio/min': 0.0014134978409856558, 'sampling/importance_sampling_ratio/mean': 0.9999502301216125, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.948424222879112e-05, 'epoch': 0.52}
+
+ 55%|█████▌    | 568/1024 [24:57:45<21:31:02, 169.87s/it][AINFO 12-01 20:29:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:29:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:29:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:29:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▌    | 569/1024 [25:00:45<21:50:18, 172.79s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0026233852840960026, 'learning_rate': 1e-05, 'num_tokens': 496747170.0, 'completions/mean_length': 7935.453125, 'completions/min_length': 1022.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7446.69384765625, 'completions/min_terminated_length': 1022.0, 'completions/max_terminated_length': 16187.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.4395000636577606, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.02145254611968994, 'sampling/sampling_logp_difference/max': 3.0041255950927734, 'sampling/importance_sampling_ratio/min': 0.049582090228796005, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 0.00010599580309644807, 'epoch': 0.52}
+
+ 56%|█████▌    | 569/1024 [25:00:45<21:50:18, 172.79s/it][AINFO 12-01 20:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:32:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▌    | 570/1024 [25:03:55<22:26:43, 177.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019762783776968718, 'learning_rate': 1e-05, 'num_tokens': 497856147.0, 'completions/mean_length': 8517.0703125, 'completions/min_length': 697.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7922.0927734375, 'completions/min_terminated_length': 697.0, 'completions/max_terminated_length': 16186.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.31140607595443726, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018956255167722702, 'sampling/sampling_logp_difference/max': 2.121992826461792, 'sampling/importance_sampling_ratio/min': 0.11979266256093979, 'sampling/importance_sampling_ratio/mean': 0.9999479651451111, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.895184156339383e-05, 'epoch': 0.52}
+
+ 56%|█████▌    | 570/1024 [25:03:55<22:26:43, 177.98s/it][AINFO 12-01 20:35:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:35:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:35:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:35:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 571/1024 [25:06:54<22:25:42, 178.24s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.0016577065689489245, 'learning_rate': 1e-05, 'num_tokens': 498831469.0, 'completions/mean_length': 7460.703125, 'completions/min_length': 1040.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7246.54443359375, 'completions/min_terminated_length': 1040.0, 'completions/max_terminated_length': 16016.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.24147525429725647, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019963182508945465, 'sampling/sampling_logp_difference/max': 1.3070106506347656, 'sampling/importance_sampling_ratio/min': 0.27062785625457764, 'sampling/importance_sampling_ratio/mean': 0.9999527931213379, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.501526513602585e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 571/1024 [25:06:54<22:25:42, 178.24s/it][AINFO 12-01 20:38:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:38:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:38:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:38:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▌    | 572/1024 [25:10:12<23:08:39, 184.34s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020632974337786436, 'learning_rate': 1e-05, 'num_tokens': 499789935.0, 'completions/mean_length': 7339.640625, 'completions/min_length': 676.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6894.83544921875, 'completions/min_terminated_length': 676.0, 'completions/max_terminated_length': 16186.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.265913724899292, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019339144229888916, 'sampling/sampling_logp_difference/max': 2.609947681427002, 'sampling/importance_sampling_ratio/min': 0.0735384002327919, 'sampling/importance_sampling_ratio/mean': 0.9999982118606567, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.986245246778708e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 572/1024 [25:10:12<23:08:39, 184.34s/it][AINFO 12-01 20:41:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:41:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:41:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:41:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▌    | 573/1024 [25:13:01<22:29:43, 179.56s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012788083404302597, 'learning_rate': 1e-05, 'num_tokens': 500604217.0, 'completions/mean_length': 6204.578125, 'completions/min_length': 864.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6043.00048828125, 'completions/min_terminated_length': 864.0, 'completions/max_terminated_length': 15765.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.20805485546588898, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019603053107857704, 'sampling/sampling_logp_difference/max': 6.073004722595215, 'sampling/importance_sampling_ratio/min': 0.002304239198565483, 'sampling/importance_sampling_ratio/mean': 0.9999653100967407, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.9461109332041815e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 573/1024 [25:13:01<22:29:43, 179.56s/it][AINFO 12-01 20:44:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:44:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:44:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:44:17 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 574/1024 [25:16:13<22:54:52, 183.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016148993745446205, 'learning_rate': 1e-05, 'num_tokens': 501698407.0, 'completions/mean_length': 8379.671875, 'completions/min_length': 1166.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7916.611328125, 'completions/min_terminated_length': 1166.0, 'completions/max_terminated_length': 15875.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020054586231708527, 'sampling/sampling_logp_difference/max': 2.671325922012329, 'sampling/importance_sampling_ratio/min': 0.19347453117370605, 'sampling/importance_sampling_ratio/mean': 1.0000286102294922, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2419247253965295e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 574/1024 [25:16:13<22:54:52, 183.32s/it][AINFO 12-01 20:47:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:47:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:47:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:47:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 56%|█████▌    | 575/1024 [25:18:53<21:59:15, 176.29s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0015374928480014205, 'learning_rate': 1e-05, 'num_tokens': 502579529.0, 'completions/mean_length': 6718.515625, 'completions/min_length': 962.0, 'completions/max_length': 15415.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6718.515625, 'completions/min_terminated_length': 962.0, 'completions/max_terminated_length': 15415.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2774616479873657, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0207870751619339, 'sampling/sampling_logp_difference/max': 3.864809274673462, 'sampling/importance_sampling_ratio/min': 0.020966922864317894, 'sampling/importance_sampling_ratio/mean': 1.000058889389038, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.462685808448441e-05, 'epoch': 0.53}
+
+ 56%|█████▌    | 575/1024 [25:18:53<21:59:15, 176.29s/it][AINFO 12-01 20:50:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:50:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:50:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:50:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▋    | 576/1024 [25:21:59<22:18:00, 179.20s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011769720586016774, 'learning_rate': 1e-05, 'num_tokens': 503493607.0, 'completions/mean_length': 6942.296875, 'completions/min_length': 1081.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6867.95263671875, 'completions/min_terminated_length': 1081.0, 'completions/max_terminated_length': 16144.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.21018433570861816, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020076727494597435, 'sampling/sampling_logp_difference/max': 3.448981285095215, 'sampling/importance_sampling_ratio/min': 0.031777992844581604, 'sampling/importance_sampling_ratio/mean': 1.0000274181365967, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.584303473440741e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 576/1024 [25:21:59<22:18:00, 179.20s/it][AINFO 12-01 20:53:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:53:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:53:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:53:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▋    | 577/1024 [25:25:02<22:24:41, 180.50s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014960706466808915, 'learning_rate': 1e-05, 'num_tokens': 504542161.0, 'completions/mean_length': 8001.765625, 'completions/min_length': 1791.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7516.8427734375, 'completions/min_terminated_length': 1791.0, 'completions/max_terminated_length': 15845.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.21488474309444427, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02026309072971344, 'sampling/sampling_logp_difference/max': 2.6403040885925293, 'sampling/importance_sampling_ratio/min': 0.07133956998586655, 'sampling/importance_sampling_ratio/mean': 0.9999983906745911, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.826115471383673e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 577/1024 [25:25:02<22:24:41, 180.50s/it][AINFO 12-01 20:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:56:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:56:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 56%|█████▋    | 578/1024 [25:27:47<21:46:09, 175.72s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017694135894998908, 'learning_rate': 1e-05, 'num_tokens': 505480788.0, 'completions/mean_length': 7213.0859375, 'completions/min_length': 706.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6601.69189453125, 'completions/min_terminated_length': 706.0, 'completions/max_terminated_length': 15285.0, 'rewards/accuracy_reward/mean': 0.6171875, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.6171875, 'reward_std': 0.325370192527771, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017670247703790665, 'sampling/sampling_logp_difference/max': 4.247172832489014, 'sampling/importance_sampling_ratio/min': 0.01430461835116148, 'sampling/importance_sampling_ratio/mean': 1.0000206232070923, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.932937528996263e-05, 'epoch': 0.53}
+
+ 56%|█████▋    | 578/1024 [25:27:47<21:46:09, 175.72s/it][AINFO 12-01 20:59:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:59:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:59:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 20:59:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 579/1024 [25:30:09<20:28:13, 165.60s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002069150796160102, 'learning_rate': 1e-05, 'num_tokens': 506318206.0, 'completions/mean_length': 6386.078125, 'completions/min_length': 956.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6307.3544921875, 'completions/min_terminated_length': 956.0, 'completions/max_terminated_length': 15553.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02102750912308693, 'sampling/sampling_logp_difference/max': 2.7496814727783203, 'sampling/importance_sampling_ratio/min': 0.06394822895526886, 'sampling/importance_sampling_ratio/mean': 1.0000425577163696, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.447664929964958e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 579/1024 [25:30:09<20:28:13, 165.60s/it][AINFO 12-01 21:01:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:01:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:01:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:01:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 580/1024 [25:33:27<21:36:59, 175.27s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0036231856793165207, 'learning_rate': 1e-05, 'num_tokens': 507543306.0, 'completions/mean_length': 9405.78125, 'completions/min_length': 470.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 9062.58984375, 'completions/min_terminated_length': 470.0, 'completions/max_terminated_length': 15486.0, 'rewards/accuracy_reward/mean': 0.171875, 'rewards/accuracy_reward/std': 0.3787541687488556, 'reward': 0.171875, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02045777067542076, 'sampling/sampling_logp_difference/max': 2.430084228515625, 'sampling/importance_sampling_ratio/min': 0.0880294144153595, 'sampling/importance_sampling_ratio/mean': 1.0000121593475342, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.855945146824524e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 580/1024 [25:33:27<21:36:59, 175.27s/it][AINFO 12-01 21:04:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:04:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:04:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:04:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 581/1024 [25:36:13<21:13:25, 172.47s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018074079416692257, 'learning_rate': 1e-05, 'num_tokens': 508561381.0, 'completions/mean_length': 7807.0234375, 'completions/min_length': 1097.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7530.3466796875, 'completions/min_terminated_length': 1097.0, 'completions/max_terminated_length': 16188.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3158867359161377, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02001657523214817, 'sampling/sampling_logp_difference/max': 5.831778526306152, 'sampling/importance_sampling_ratio/min': 0.0029328560922294855, 'sampling/importance_sampling_ratio/mean': 0.9999621510505676, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.105470765760401e-05, 'epoch': 0.53}
+
+ 57%|█████▋    | 581/1024 [25:36:13<21:13:25, 172.47s/it][AINFO 12-01 21:07:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:07:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:07:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:07:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 57%|█████▋    | 582/1024 [25:38:49<20:34:05, 167.52s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012882085284218192, 'learning_rate': 1e-05, 'num_tokens': 509342496.0, 'completions/mean_length': 5933.7109375, 'completions/min_length': 758.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5682.904296875, 'completions/min_terminated_length': 758.0, 'completions/max_terminated_length': 15527.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3232485055923462, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019918859004974365, 'sampling/sampling_logp_difference/max': 5.360018253326416, 'sampling/importance_sampling_ratio/min': 0.004700820427387953, 'sampling/importance_sampling_ratio/mean': 1.0000150203704834, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6891026588345994e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 582/1024 [25:38:49<20:34:05, 167.52s/it][AINFO 12-01 21:10:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:10:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:10:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:10:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 583/1024 [25:41:27<20:10:57, 164.76s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001399575499817729, 'learning_rate': 1e-05, 'num_tokens': 510261476.0, 'completions/mean_length': 7035.53125, 'completions/min_length': 962.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6811.16845703125, 'completions/min_terminated_length': 962.0, 'completions/max_terminated_length': 16347.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.19780510663986206, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021985257044434547, 'sampling/sampling_logp_difference/max': 3.045283317565918, 'sampling/importance_sampling_ratio/min': 0.047582827508449554, 'sampling/importance_sampling_ratio/mean': 1.0000382661819458, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3468454325411585e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 583/1024 [25:41:27<20:10:57, 164.76s/it][AINFO 12-01 21:12:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:12:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:12:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:12:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 584/1024 [25:43:50<19:21:09, 158.34s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0014903623377904296, 'learning_rate': 1e-05, 'num_tokens': 511081526.0, 'completions/mean_length': 6274.078125, 'completions/min_length': 999.0, 'completions/max_length': 15487.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6274.078125, 'completions/min_terminated_length': 999.0, 'completions/max_terminated_length': 15487.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.28747400641441345, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0190611332654953, 'sampling/sampling_logp_difference/max': 2.266602039337158, 'sampling/importance_sampling_ratio/min': 0.10366382449865341, 'sampling/importance_sampling_ratio/mean': 1.0000481605529785, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.54950172752433e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 584/1024 [25:43:50<19:21:09, 158.34s/it][AINFO 12-01 21:15:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:15:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:15:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:15:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 585/1024 [25:46:39<19:41:34, 161.49s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001599375274963677, 'learning_rate': 1e-05, 'num_tokens': 512094570.0, 'completions/mean_length': 7746.09375, 'completions/min_length': 981.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7538.7841796875, 'completions/min_terminated_length': 981.0, 'completions/max_terminated_length': 15999.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02181185409426689, 'sampling/sampling_logp_difference/max': 1.6803226470947266, 'sampling/importance_sampling_ratio/min': 0.18631383776664734, 'sampling/importance_sampling_ratio/mean': 1.000056266784668, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7378545787069015e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 585/1024 [25:46:39<19:41:34, 161.49s/it][AINFO 12-01 21:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:17:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:17:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 586/1024 [25:49:22<19:42:19, 161.96s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001955796265974641, 'learning_rate': 1e-05, 'num_tokens': 513116082.0, 'completions/mean_length': 7838.3125, 'completions/min_length': 1122.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7633.21630859375, 'completions/min_terminated_length': 1122.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019127119332551956, 'sampling/sampling_logp_difference/max': 13.176095962524414, 'sampling/importance_sampling_ratio/min': 1.8953710423375014e-06, 'sampling/importance_sampling_ratio/mean': 1.000051736831665, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.310327196639264e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 586/1024 [25:49:22<19:42:19, 161.96s/it][AINFO 12-01 21:20:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:20:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:20:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:20:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 587/1024 [25:52:18<20:10:10, 166.16s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.003028444480150938, 'learning_rate': 1e-05, 'num_tokens': 514144539.0, 'completions/mean_length': 7891.1328125, 'completions/min_length': 1416.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7756.32568359375, 'completions/min_terminated_length': 1416.0, 'completions/max_terminated_length': 16256.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021172503009438515, 'sampling/sampling_logp_difference/max': 4.695827007293701, 'sampling/importance_sampling_ratio/min': 0.009133310988545418, 'sampling/importance_sampling_ratio/mean': 1.000010371208191, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.622857527967426e-05, 'epoch': 0.54}
+
+ 57%|█████▋    | 587/1024 [25:52:18<20:10:10, 166.16s/it][AINFO 12-01 21:23:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:23:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:23:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:23:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 57%|█████▋    | 588/1024 [25:55:26<20:53:57, 172.56s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017828773707151413, 'learning_rate': 1e-05, 'num_tokens': 515278144.0, 'completions/mean_length': 8709.3515625, 'completions/min_length': 1162.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7915.42236328125, 'completions/min_terminated_length': 1162.0, 'completions/max_terminated_length': 16374.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3855929672718048, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019248347729444504, 'sampling/sampling_logp_difference/max': 7.3514628410339355, 'sampling/importance_sampling_ratio/min': 0.0006416530231945217, 'sampling/importance_sampling_ratio/mean': 1.000035047531128, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 0.00010056014480142039, 'epoch': 0.54}
+
+ 57%|█████▋    | 588/1024 [25:55:26<20:53:57, 172.56s/it][AINFO 12-01 21:26:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:26:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:26:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:26:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 589/1024 [25:58:17<20:47:57, 172.13s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019230139441788197, 'learning_rate': 1e-05, 'num_tokens': 516291690.0, 'completions/mean_length': 7735.515625, 'completions/min_length': 668.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7598.23876953125, 'completions/min_terminated_length': 668.0, 'completions/max_terminated_length': 16303.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3327290117740631, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020843125879764557, 'sampling/sampling_logp_difference/max': 7.920862674713135, 'sampling/importance_sampling_ratio/min': 0.0003630889405030757, 'sampling/importance_sampling_ratio/mean': 0.9999810457229614, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6425849834340625e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 589/1024 [25:58:17<20:47:57, 172.13s/it][AINFO 12-01 21:29:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:29:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:29:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:29:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 590/1024 [26:01:24<21:17:45, 176.65s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001919507049024105, 'learning_rate': 1e-05, 'num_tokens': 517328853.0, 'completions/mean_length': 7942.2109375, 'completions/min_length': 861.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7599.04833984375, 'completions/min_terminated_length': 861.0, 'completions/max_terminated_length': 16378.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.27168768644332886, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01977304369211197, 'sampling/sampling_logp_difference/max': 2.109370231628418, 'sampling/importance_sampling_ratio/min': 0.12131434679031372, 'sampling/importance_sampling_ratio/mean': 1.0000510215759277, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.468843710332294e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 590/1024 [26:01:24<21:17:45, 176.65s/it][AINFO 12-01 21:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:32:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:32:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 58%|█████▊    | 591/1024 [26:04:34<21:44:03, 180.70s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001157480524852872, 'learning_rate': 1e-05, 'num_tokens': 518419766.0, 'completions/mean_length': 8355.3203125, 'completions/min_length': 1405.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7600.48779296875, 'completions/min_terminated_length': 1405.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.24541422724723816, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020228758454322815, 'sampling/sampling_logp_difference/max': 2.3082211017608643, 'sampling/importance_sampling_ratio/min': 0.09943798184394836, 'sampling/importance_sampling_ratio/mean': 1.0000561475753784, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.4619784830501885e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 591/1024 [26:04:34<21:44:03, 180.70s/it][AINFO 12-01 21:35:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:35:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:35:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:35:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 592/1024 [26:07:33<21:36:17, 180.04s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007985631818883121, 'learning_rate': 1e-05, 'num_tokens': 519590349.0, 'completions/mean_length': 8976.1171875, 'completions/min_length': 1904.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 8917.787109375, 'completions/min_terminated_length': 1904.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.023678822442889214, 'sampling/sampling_logp_difference/max': 4.576128005981445, 'sampling/importance_sampling_ratio/min': 0.01029468048363924, 'sampling/importance_sampling_ratio/mean': 1.0000450611114502, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.935972149018198e-05, 'epoch': 0.54}
+
+ 58%|█████▊    | 592/1024 [26:07:33<21:36:17, 180.04s/it][AINFO 12-01 21:38:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:38:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:38:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:38:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 593/1024 [26:10:14<20:52:22, 174.35s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018190189730376005, 'learning_rate': 1e-05, 'num_tokens': 520541013.0, 'completions/mean_length': 7280.25, 'completions/min_length': 862.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7135.74658203125, 'completions/min_terminated_length': 862.0, 'completions/max_terminated_length': 16200.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3243142366409302, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019840437918901443, 'sampling/sampling_logp_difference/max': 10.633879661560059, 'sampling/importance_sampling_ratio/min': 2.408600448688958e-05, 'sampling/importance_sampling_ratio/mean': 0.999998152256012, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.545264336135006e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 593/1024 [26:10:14<20:52:22, 174.35s/it][AINFO 12-01 21:41:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:41:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:41:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:41:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 594/1024 [26:13:25<21:26:06, 179.46s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007153755286708474, 'learning_rate': 1e-05, 'num_tokens': 521551520.0, 'completions/mean_length': 7746.7734375, 'completions/min_length': 787.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7468.15283203125, 'completions/min_terminated_length': 787.0, 'completions/max_terminated_length': 16210.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.09863808751106262, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.0206192284822464, 'sampling/sampling_logp_difference/max': 11.103106498718262, 'sampling/importance_sampling_ratio/min': 1.5065450497786514e-05, 'sampling/importance_sampling_ratio/mean': 1.000016212463379, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.3120661662924249e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 594/1024 [26:13:25<21:26:06, 179.46s/it][AINFO 12-01 21:44:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:44:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:44:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:44:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 595/1024 [26:16:39<21:53:55, 183.77s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013517775805667043, 'learning_rate': 1e-05, 'num_tokens': 522616029.0, 'completions/mean_length': 8171.4140625, 'completions/min_length': 1307.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7623.90869140625, 'completions/min_terminated_length': 1307.0, 'completions/max_terminated_length': 15623.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.17912298440933228, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019750583916902542, 'sampling/sampling_logp_difference/max': 2.5026142597198486, 'sampling/importance_sampling_ratio/min': 0.0818706825375557, 'sampling/importance_sampling_ratio/mean': 1.0000152587890625, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.0229263839155465e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 595/1024 [26:16:39<21:53:55, 183.77s/it][AINFO 12-01 21:47:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:47:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:47:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:47:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 596/1024 [26:19:36<21:37:03, 181.83s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0011024746345356107, 'learning_rate': 1e-05, 'num_tokens': 523696287.0, 'completions/mean_length': 8302.015625, 'completions/min_length': 1610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8041.30615234375, 'completions/min_terminated_length': 1610.0, 'completions/max_terminated_length': 15762.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019759435206651688, 'sampling/sampling_logp_difference/max': 1.9021872282028198, 'sampling/importance_sampling_ratio/min': 0.1492418348789215, 'sampling/importance_sampling_ratio/mean': 0.9999597072601318, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.4279155190452e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 596/1024 [26:19:36<21:37:03, 181.83s/it][AINFO 12-01 21:50:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:50:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:50:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:50:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 597/1024 [26:22:09<20:33:08, 173.28s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0030766907148063183, 'learning_rate': 1e-05, 'num_tokens': 524506247.0, 'completions/mean_length': 6132.25, 'completions/min_length': 1024.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6051.52734375, 'completions/min_terminated_length': 1024.0, 'completions/max_terminated_length': 14610.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.41292232275009155, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.021624643355607986, 'sampling/sampling_logp_difference/max': 1.936978816986084, 'sampling/importance_sampling_ratio/min': 0.14413875341415405, 'sampling/importance_sampling_ratio/mean': 0.9999866485595703, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.276869723682466e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 597/1024 [26:22:09<20:33:08, 173.28s/it][AINFO 12-01 21:53:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:53:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:53:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:53:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 598/1024 [26:24:53<20:10:25, 170.48s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020483757834881544, 'learning_rate': 1e-05, 'num_tokens': 525491938.0, 'completions/mean_length': 7507.9609375, 'completions/min_length': 114.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7294.9365234375, 'completions/min_terminated_length': 114.0, 'completions/max_terminated_length': 16352.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3135228455066681, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.022341173142194748, 'sampling/sampling_logp_difference/max': 4.1009721755981445, 'sampling/importance_sampling_ratio/min': 0.016556572169065475, 'sampling/importance_sampling_ratio/mean': 0.9999257922172546, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.629508762514888e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 598/1024 [26:24:53<20:10:25, 170.48s/it][AINFO 12-01 21:56:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:56:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:56:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:56:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 58%|█████▊    | 599/1024 [26:27:37<19:53:33, 168.50s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.003444709349423647, 'learning_rate': 1e-05, 'num_tokens': 526463663.0, 'completions/mean_length': 7392.5390625, 'completions/min_length': 1717.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7176.744140625, 'completions/min_terminated_length': 1717.0, 'completions/max_terminated_length': 16003.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3543020486831665, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020350588485598564, 'sampling/sampling_logp_difference/max': 11.635331153869629, 'sampling/importance_sampling_ratio/min': 8.847893695929088e-06, 'sampling/importance_sampling_ratio/mean': 1.0000183582305908, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.058435681348783e-05, 'epoch': 0.55}
+
+ 58%|█████▊    | 599/1024 [26:27:37<19:53:33, 168.50s/it][AINFO 12-01 21:58:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:58:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:58:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 21:58:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▊    | 600/1024 [26:30:27<19:53:08, 168.84s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015880388673394918, 'learning_rate': 1e-05, 'num_tokens': 527571440.0, 'completions/mean_length': 8504.2578125, 'completions/min_length': 2012.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8250.072265625, 'completions/min_terminated_length': 2012.0, 'completions/max_terminated_length': 16262.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.30327799916267395, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020317042246460915, 'sampling/sampling_logp_difference/max': 2.3617703914642334, 'sampling/importance_sampling_ratio/min': 0.09425321221351624, 'sampling/importance_sampling_ratio/mean': 1.000022530555725, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.970160873403074e-05, 'epoch': 0.55}
+
+ 59%|█████▊    | 600/1024 [26:30:27<19:53:08, 168.84s/it][AINFO 12-01 22:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:01:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:01:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▊    | 601/1024 [26:33:27<20:15:04, 172.35s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019962217193096876, 'learning_rate': 1e-05, 'num_tokens': 528513544.0, 'completions/mean_length': 7196.0, 'completions/min_length': 1222.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6744.130859375, 'completions/min_terminated_length': 1222.0, 'completions/max_terminated_length': 16230.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.36691081523895264, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018917370587587357, 'sampling/sampling_logp_difference/max': 1.812622308731079, 'sampling/importance_sampling_ratio/min': 0.16322554647922516, 'sampling/importance_sampling_ratio/mean': 0.9999862313270569, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.160130326155922e-05, 'epoch': 0.55}
+
+ 59%|█████▊    | 601/1024 [26:33:27<20:15:04, 172.35s/it][AINFO 12-01 22:04:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:04:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:04:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:04:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▉    | 602/1024 [26:36:38<20:51:14, 177.90s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010357614373788238, 'learning_rate': 1e-05, 'num_tokens': 529467543.0, 'completions/mean_length': 7311.7421875, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7167.73828125, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 15665.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020512349903583527, 'sampling/sampling_logp_difference/max': 1.9824538230895996, 'sampling/importance_sampling_ratio/min': 0.13773085176944733, 'sampling/importance_sampling_ratio/mean': 1.0000078678131104, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.431116278487025e-05, 'epoch': 0.55}
+
+ 59%|█████▉    | 602/1024 [26:36:38<20:51:14, 177.90s/it][AINFO 12-01 22:07:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:07:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:07:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:07:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▉    | 603/1024 [26:39:13<19:59:49, 171.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0005855494528077543, 'learning_rate': 1e-05, 'num_tokens': 530356663.0, 'completions/mean_length': 6776.5, 'completions/min_length': 1029.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6700.8505859375, 'completions/min_terminated_length': 1029.0, 'completions/max_terminated_length': 16178.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.12756995856761932, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.020424578338861465, 'sampling/sampling_logp_difference/max': 4.669631004333496, 'sampling/importance_sampling_ratio/min': 0.00937572866678238, 'sampling/importance_sampling_ratio/mean': 1.0000473260879517, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.0827637342790695e-05, 'epoch': 0.55}
+
+ 59%|█████▉    | 603/1024 [26:39:13<19:59:49, 171.00s/it][AINFO 12-01 22:10:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:10:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:10:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:10:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▉    | 604/1024 [26:42:22<20:33:26, 176.21s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0015709311701357365, 'learning_rate': 1e-05, 'num_tokens': 531472531.0, 'completions/mean_length': 8569.21875, 'completions/min_length': 1010.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7978.18505859375, 'completions/min_terminated_length': 1010.0, 'completions/max_terminated_length': 15672.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3135277032852173, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020154882222414017, 'sampling/sampling_logp_difference/max': 9.75666618347168, 'sampling/importance_sampling_ratio/min': 5.7907349400920793e-05, 'sampling/importance_sampling_ratio/mean': 0.9999009370803833, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.587741432260373e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 604/1024 [26:42:22<20:33:26, 176.21s/it][AINFO 12-01 22:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:13:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:13:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▉    | 605/1024 [26:45:09<20:11:15, 173.45s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002377378987148404, 'learning_rate': 1e-05, 'num_tokens': 532392060.0, 'completions/mean_length': 7036.3203125, 'completions/min_length': 1050.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6734.7822265625, 'completions/min_terminated_length': 1050.0, 'completions/max_terminated_length': 16177.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021056536585092545, 'sampling/sampling_logp_difference/max': 3.5602424144744873, 'sampling/importance_sampling_ratio/min': 0.028431933373212814, 'sampling/importance_sampling_ratio/mean': 1.0000391006469727, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.547897242446197e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 605/1024 [26:45:09<20:11:15, 173.45s/it][AINFO 12-01 22:16:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:16:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:16:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:16:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 606/1024 [26:48:08<20:20:19, 175.17s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.000958350021392107, 'learning_rate': 1e-05, 'num_tokens': 533426241.0, 'completions/mean_length': 7923.7890625, 'completions/min_length': 879.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7720.744140625, 'completions/min_terminated_length': 879.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019714586436748505, 'sampling/sampling_logp_difference/max': 2.5728724002838135, 'sampling/importance_sampling_ratio/min': 0.07631602138280869, 'sampling/importance_sampling_ratio/mean': 0.9999751448631287, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2292672990006395e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 606/1024 [26:48:08<20:20:19, 175.17s/it][AINFO 12-01 22:19:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:19:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:19:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:19:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 59%|█████▉    | 607/1024 [26:51:24<21:01:22, 181.49s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020389275159686804, 'learning_rate': 1e-05, 'num_tokens': 534456384.0, 'completions/mean_length': 7899.1171875, 'completions/min_length': 1410.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7257.40380859375, 'completions/min_terminated_length': 1410.0, 'completions/max_terminated_length': 16361.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017849881201982498, 'sampling/sampling_logp_difference/max': 2.9765052795410156, 'sampling/importance_sampling_ratio/min': 0.0509706512093544, 'sampling/importance_sampling_ratio/mean': 1.000032901763916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.213034589563904e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 607/1024 [26:51:24<21:01:22, 181.49s/it][AINFO 12-01 22:22:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:22:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:22:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:22:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▉    | 608/1024 [26:54:19<20:44:10, 179.45s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013492320431396365, 'learning_rate': 1e-05, 'num_tokens': 535390069.0, 'completions/mean_length': 7125.0390625, 'completions/min_length': 952.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6978.07177734375, 'completions/min_terminated_length': 952.0, 'completions/max_terminated_length': 15557.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020903976634144783, 'sampling/sampling_logp_difference/max': 2.112229347229004, 'sampling/importance_sampling_ratio/min': 0.12096798419952393, 'sampling/importance_sampling_ratio/mean': 1.0000267028808594, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.057708858657861e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 608/1024 [26:54:19<20:44:10, 179.45s/it][AINFO 12-01 22:25:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:25:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:25:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:25:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 59%|█████▉    | 609/1024 [26:57:20<20:44:39, 179.95s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001709628733806312, 'learning_rate': 1e-05, 'num_tokens': 536427992.0, 'completions/mean_length': 7967.6484375, 'completions/min_length': 1171.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7696.15283203125, 'completions/min_terminated_length': 1171.0, 'completions/max_terminated_length': 15319.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01936802640557289, 'sampling/sampling_logp_difference/max': 1.4338665008544922, 'sampling/importance_sampling_ratio/min': 0.2383854240179062, 'sampling/importance_sampling_ratio/mean': 0.9999380707740784, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3375986933824606e-05, 'epoch': 0.56}
+
+ 59%|█████▉    | 609/1024 [26:57:20<20:44:39, 179.95s/it][AINFO 12-01 22:28:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:28:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:28:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:28:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 60%|█████▉    | 610/1024 [27:00:21<20:45:02, 180.44s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010841956827789545, 'learning_rate': 1e-05, 'num_tokens': 537491661.0, 'completions/mean_length': 8123.9765625, 'completions/min_length': 885.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7857.52392578125, 'completions/min_terminated_length': 885.0, 'completions/max_terminated_length': 16121.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.22461043298244476, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020965993404388428, 'sampling/sampling_logp_difference/max': 3.107165813446045, 'sampling/importance_sampling_ratio/min': 0.044727545231580734, 'sampling/importance_sampling_ratio/mean': 0.9999263286590576, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.966484402757487e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 610/1024 [27:00:21<20:45:02, 180.44s/it][AINFO 12-01 22:31:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:31:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:31:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:31:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 60%|█████▉    | 611/1024 [27:03:24<20:46:39, 181.11s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007988002616912127, 'learning_rate': 1e-05, 'num_tokens': 538562926.0, 'completions/mean_length': 8236.3203125, 'completions/min_length': 642.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7693.14208984375, 'completions/min_terminated_length': 642.0, 'completions/max_terminated_length': 16016.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.13098980486392975, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.020337127149105072, 'sampling/sampling_logp_difference/max': 8.792131423950195, 'sampling/importance_sampling_ratio/min': 0.00015192379942163825, 'sampling/importance_sampling_ratio/mean': 1.0000181198120117, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.8863173863792326e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 611/1024 [27:03:24<20:46:39, 181.11s/it][AINFO 12-01 22:34:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:34:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:34:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:34:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 60%|█████▉    | 612/1024 [27:06:38<21:10:20, 185.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017065460560843349, 'learning_rate': 1e-05, 'num_tokens': 539737801.0, 'completions/mean_length': 9032.3359375, 'completions/min_length': 1979.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8733.4873046875, 'completions/min_terminated_length': 1979.0, 'completions/max_terminated_length': 16232.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2301519513130188, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019908349961042404, 'sampling/sampling_logp_difference/max': 1.6481356620788574, 'sampling/importance_sampling_ratio/min': 0.19240829348564148, 'sampling/importance_sampling_ratio/mean': 1.0001003742218018, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.2575629006714735e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 612/1024 [27:06:38<21:10:20, 185.00s/it][AINFO 12-01 22:37:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:37:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:37:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:37:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|█████▉    | 613/1024 [27:09:34<20:49:25, 182.40s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0014859962975606322, 'learning_rate': 1e-05, 'num_tokens': 540642797.0, 'completions/mean_length': 6914.34375, 'completions/min_length': 937.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6608.87060546875, 'completions/min_terminated_length': 937.0, 'completions/max_terminated_length': 15831.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2477683573961258, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018462607637047768, 'sampling/sampling_logp_difference/max': 5.818559169769287, 'sampling/importance_sampling_ratio/min': 0.0029718840960413218, 'sampling/importance_sampling_ratio/mean': 0.9999843239784241, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.247635061143228e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 613/1024 [27:09:34<20:49:25, 182.40s/it][AINFO 12-01 22:40:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:40:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:40:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:40:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|█████▉    | 614/1024 [27:12:33<20:38:48, 181.29s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0024468135088682175, 'learning_rate': 1e-05, 'num_tokens': 541735621.0, 'completions/mean_length': 8381.0, 'completions/min_length': 1537.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7628.58154296875, 'completions/min_terminated_length': 1537.0, 'completions/max_terminated_length': 16092.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.23857945203781128, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021267445757985115, 'sampling/sampling_logp_difference/max': 2.491960287094116, 'sampling/importance_sampling_ratio/min': 0.08274760097265244, 'sampling/importance_sampling_ratio/mean': 0.9999700784683228, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.77945600575913e-05, 'epoch': 0.56}
+
+ 60%|█████▉    | 614/1024 [27:12:33<20:38:48, 181.29s/it][AINFO 12-01 22:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:43:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:43:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 615/1024 [27:15:24<20:13:44, 178.05s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001426701433956623, 'learning_rate': 1e-05, 'num_tokens': 542865918.0, 'completions/mean_length': 8668.6328125, 'completions/min_length': 792.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8546.1669921875, 'completions/min_terminated_length': 792.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.24541424214839935, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022376585751771927, 'sampling/sampling_logp_difference/max': 4.4621124267578125, 'sampling/importance_sampling_ratio/min': 0.011537964455783367, 'sampling/importance_sampling_ratio/mean': 0.9999186396598816, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.538752509437472e-05, 'epoch': 0.57}
+
+ 60%|██████    | 615/1024 [27:15:24<20:13:44, 178.05s/it][AINFO 12-01 22:46:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:46:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:46:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:46:40 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 616/1024 [27:18:15<19:56:41, 175.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018925584154203534, 'learning_rate': 1e-05, 'num_tokens': 543790173.0, 'completions/mean_length': 7059.0546875, 'completions/min_length': 1343.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6679.99169921875, 'completions/min_terminated_length': 1343.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.26143792271614075, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018869083374738693, 'sampling/sampling_logp_difference/max': 2.306563377380371, 'sampling/importance_sampling_ratio/min': 0.09960296005010605, 'sampling/importance_sampling_ratio/mean': 0.9999918937683105, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.8422301006212365e-05, 'epoch': 0.57}
+
+ 60%|██████    | 616/1024 [27:18:15<19:56:41, 175.98s/it][AINFO 12-01 22:49:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:49:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:49:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:49:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 60%|██████    | 617/1024 [27:21:11<19:54:56, 176.16s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018952018581330776, 'learning_rate': 1e-05, 'num_tokens': 544879295.0, 'completions/mean_length': 8360.015625, 'completions/min_length': 1389.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8232.6513671875, 'completions/min_terminated_length': 1389.0, 'completions/max_terminated_length': 16334.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.20699402689933777, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02195816859602928, 'sampling/sampling_logp_difference/max': 2.3631234169006348, 'sampling/importance_sampling_ratio/min': 0.09412577003240585, 'sampling/importance_sampling_ratio/mean': 0.9999762773513794, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.197831933401176e-05, 'epoch': 0.57}
+
+ 60%|██████    | 617/1024 [27:21:11<19:54:56, 176.16s/it][AINFO 12-01 22:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:52:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 60%|██████    | 618/1024 [27:23:57<19:30:48, 173.03s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015583160566166043, 'learning_rate': 1e-05, 'num_tokens': 545781269.0, 'completions/mean_length': 6908.859375, 'completions/min_length': 747.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6681.45654296875, 'completions/min_terminated_length': 747.0, 'completions/max_terminated_length': 15863.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3095887303352356, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020639434456825256, 'sampling/sampling_logp_difference/max': 3.2231688499450684, 'sampling/importance_sampling_ratio/min': 0.039828646928071976, 'sampling/importance_sampling_ratio/mean': 1.0000723600387573, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1856849722753395e-05, 'epoch': 0.57}
+
+ 60%|██████    | 618/1024 [27:23:57<19:30:48, 173.03s/it][AINFO 12-01 22:55:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:55:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:55:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:55:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 60%|██████    | 619/1024 [27:26:30<18:47:06, 166.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001624589436687529, 'learning_rate': 1e-05, 'num_tokens': 546452456.0, 'completions/mean_length': 5066.0234375, 'completions/min_length': 815.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4976.9052734375, 'completions/min_terminated_length': 815.0, 'completions/max_terminated_length': 16172.0, 'rewards/accuracy_reward/mean': 0.65625, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.65625, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01701224222779274, 'sampling/sampling_logp_difference/max': 1.541041374206543, 'sampling/importance_sampling_ratio/min': 0.21415796875953674, 'sampling/importance_sampling_ratio/mean': 1.0000022649765015, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.061554756162877e-05, 'epoch': 0.57}
+
+ 60%|██████    | 619/1024 [27:26:30<18:47:06, 166.98s/it][AINFO 12-01 22:57:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:57:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:57:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 22:57:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████    | 620/1024 [27:29:04<18:18:51, 163.20s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016262276330962777, 'learning_rate': 1e-05, 'num_tokens': 547324868.0, 'completions/mean_length': 6672.15625, 'completions/min_length': 993.0, 'completions/max_length': 15417.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6672.15625, 'completions/min_terminated_length': 993.0, 'completions/max_terminated_length': 15417.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2880108058452606, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019024454057216644, 'sampling/sampling_logp_difference/max': 6.999964237213135, 'sampling/importance_sampling_ratio/min': 0.0009119146270677447, 'sampling/importance_sampling_ratio/mean': 1.0000323057174683, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6028416111075785e-05, 'epoch': 0.57}
+
+ 61%|██████    | 620/1024 [27:29:04<18:18:51, 163.20s/it][AINFO 12-01 23:00:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:00:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:00:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:00:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████    | 621/1024 [27:32:03<18:46:21, 167.70s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0021924057509750128, 'learning_rate': 1e-05, 'num_tokens': 548446836.0, 'completions/mean_length': 8605.5, 'completions/min_length': 1557.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 8017.21044921875, 'completions/min_terminated_length': 1557.0, 'completions/max_terminated_length': 15977.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2880108058452606, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01956101320683956, 'sampling/sampling_logp_difference/max': 4.94474458694458, 'sampling/importance_sampling_ratio/min': 0.0071207331493496895, 'sampling/importance_sampling_ratio/mean': 0.9999799728393555, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.763149622202036e-05, 'epoch': 0.57}
+
+ 61%|██████    | 621/1024 [27:32:03<18:46:21, 167.70s/it][AINFO 12-01 23:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:03:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████    | 622/1024 [27:34:29<18:01:20, 161.39s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001583328004926443, 'learning_rate': 1e-05, 'num_tokens': 549280921.0, 'completions/mean_length': 6373.9140625, 'completions/min_length': 564.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6133.67236328125, 'completions/min_terminated_length': 564.0, 'completions/max_terminated_length': 15910.0, 'rewards/accuracy_reward/mean': 0.6953125, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.6953125, 'reward_std': 0.24541422724723816, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019507236778736115, 'sampling/sampling_logp_difference/max': 7.670132637023926, 'sampling/importance_sampling_ratio/min': 0.0004665559681598097, 'sampling/importance_sampling_ratio/mean': 0.9999409317970276, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.2999010159073805e-05, 'epoch': 0.57}
+
+ 61%|██████    | 622/1024 [27:34:29<18:01:20, 161.39s/it][AINFO 12-01 23:05:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:05:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:05:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:05:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████    | 623/1024 [27:37:29<18:35:22, 166.89s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015107174403965473, 'learning_rate': 1e-05, 'num_tokens': 550280132.0, 'completions/mean_length': 7658.3984375, 'completions/min_length': 1187.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7376.92724609375, 'completions/min_terminated_length': 1187.0, 'completions/max_terminated_length': 16202.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.21595829725265503, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02035154402256012, 'sampling/sampling_logp_difference/max': 2.1989099979400635, 'sampling/importance_sampling_ratio/min': 0.11092399805784225, 'sampling/importance_sampling_ratio/mean': 1.0000554323196411, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5113492458549445e-05, 'epoch': 0.57}
+
+ 61%|██████    | 623/1024 [27:37:29<18:35:22, 166.89s/it][AINFO 12-01 23:08:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:08:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:08:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:08:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████    | 624/1024 [27:40:24<18:48:08, 169.22s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.0024372541811317205, 'learning_rate': 1e-05, 'num_tokens': 551191491.0, 'completions/mean_length': 6963.6796875, 'completions/min_length': 1048.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6737.59228515625, 'completions/min_terminated_length': 1048.0, 'completions/max_terminated_length': 15242.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020302029326558113, 'sampling/sampling_logp_difference/max': 1.9107463359832764, 'sampling/importance_sampling_ratio/min': 0.14796990156173706, 'sampling/importance_sampling_ratio/mean': 0.9999798536300659, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.42168414704247e-05, 'epoch': 0.57}
+
+ 61%|██████    | 624/1024 [27:40:24<18:48:08, 169.22s/it][AINFO 12-01 23:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:11:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:11:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████    | 625/1024 [27:43:19<18:57:51, 171.11s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018845387967303395, 'learning_rate': 1e-05, 'num_tokens': 552190583.0, 'completions/mean_length': 7665.71875, 'completions/min_length': 1202.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7456.48046875, 'completions/min_terminated_length': 1202.0, 'completions/max_terminated_length': 15946.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.20357416570186615, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02094152383506298, 'sampling/sampling_logp_difference/max': 2.8640003204345703, 'sampling/importance_sampling_ratio/min': 0.057040125131607056, 'sampling/importance_sampling_ratio/mean': 0.9999953508377075, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.465714735284564e-05, 'epoch': 0.57}
+
+ 61%|██████    | 625/1024 [27:43:19<18:57:51, 171.11s/it][AINFO 12-01 23:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:14:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████    | 626/1024 [27:46:17<19:08:17, 173.11s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018039672868326306, 'learning_rate': 1e-05, 'num_tokens': 553308559.0, 'completions/mean_length': 8575.0, 'completions/min_length': 1328.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 8387.583984375, 'completions/min_terminated_length': 1328.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.17358636856079102, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020615071058273315, 'sampling/sampling_logp_difference/max': 9.28635311126709, 'sampling/importance_sampling_ratio/min': 9.26804423215799e-05, 'sampling/importance_sampling_ratio/mean': 1.000011682510376, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.4703750429798674e-05, 'epoch': 0.58}
+
+ 61%|██████    | 626/1024 [27:46:17<19:08:17, 173.11s/it][AINFO 12-01 23:17:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:17:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:17:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:17:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████    | 627/1024 [27:49:27<19:39:13, 178.22s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002128654159605503, 'learning_rate': 1e-05, 'num_tokens': 554337105.0, 'completions/mean_length': 7894.265625, 'completions/min_length': 540.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7549.154296875, 'completions/min_terminated_length': 540.0, 'completions/max_terminated_length': 15932.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2919674217700958, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02009463496506214, 'sampling/sampling_logp_difference/max': 9.747733116149902, 'sampling/importance_sampling_ratio/min': 5.84269619139377e-05, 'sampling/importance_sampling_ratio/mean': 1.0000321865081787, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.379705496328825e-05, 'epoch': 0.58}
+
+ 61%|██████    | 627/1024 [27:49:27<19:39:13, 178.22s/it][AINFO 12-01 23:20:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:20:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:20:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:20:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 61%|██████▏   | 628/1024 [27:52:29<19:44:32, 179.48s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014615607215091586, 'learning_rate': 1e-05, 'num_tokens': 555253790.0, 'completions/mean_length': 7010.7890625, 'completions/min_length': 1198.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6301.89111328125, 'completions/min_terminated_length': 1198.0, 'completions/max_terminated_length': 15524.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.20069602131843567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020959876477718353, 'sampling/sampling_logp_difference/max': 5.0238189697265625, 'sampling/importance_sampling_ratio/min': 0.006579352542757988, 'sampling/importance_sampling_ratio/mean': 1.000049352645874, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.0814046062441776e-05, 'epoch': 0.58}
+
+ 61%|██████▏   | 628/1024 [27:52:29<19:44:32, 179.48s/it][AINFO 12-01 23:23:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:23:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:23:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:23:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 61%|██████▏   | 629/1024 [27:55:20<19:22:59, 176.66s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0015409741317853332, 'learning_rate': 1e-05, 'num_tokens': 556204742.0, 'completions/mean_length': 7285.1875, 'completions/min_length': 875.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7066.81640625, 'completions/min_terminated_length': 875.0, 'completions/max_terminated_length': 15118.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02075924351811409, 'sampling/sampling_logp_difference/max': 1.9338470697402954, 'sampling/importance_sampling_ratio/min': 0.14459088444709778, 'sampling/importance_sampling_ratio/mean': 1.0000330209732056, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.079376084424439e-05, 'epoch': 0.58}
+
+ 61%|██████▏   | 629/1024 [27:55:20<19:22:59, 176.66s/it][AINFO 12-01 23:26:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:26:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:26:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:26:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 630/1024 [27:58:22<19:31:21, 178.38s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015791176119819283, 'learning_rate': 1e-05, 'num_tokens': 557303331.0, 'completions/mean_length': 8408.7890625, 'completions/min_length': 1746.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7732.923828125, 'completions/min_terminated_length': 1746.0, 'completions/max_terminated_length': 15576.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.16675157845020294, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02041950821876526, 'sampling/sampling_logp_difference/max': 2.3329830169677734, 'sampling/importance_sampling_ratio/min': 0.0970059409737587, 'sampling/importance_sampling_ratio/mean': 0.9999826550483704, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.5051369220818742e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 630/1024 [27:58:22<19:31:21, 178.38s/it][AINFO 12-01 23:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:29:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:29:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 631/1024 [28:01:22<19:31:05, 178.79s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0006929045775905252, 'learning_rate': 1e-05, 'num_tokens': 558396345.0, 'completions/mean_length': 8401.671875, 'completions/min_length': 1267.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7939.8837890625, 'completions/min_terminated_length': 1267.0, 'completions/max_terminated_length': 15977.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01959926262497902, 'sampling/sampling_logp_difference/max': 3.1952662467956543, 'sampling/importance_sampling_ratio/min': 0.040955618023872375, 'sampling/importance_sampling_ratio/mean': 0.9999886155128479, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.7471621106324164e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 631/1024 [28:01:22<19:31:05, 178.79s/it][AINFO 12-01 23:32:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:32:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:32:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:32:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 632/1024 [28:04:07<19:02:39, 174.90s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018388873431831598, 'learning_rate': 1e-05, 'num_tokens': 559434112.0, 'completions/mean_length': 7954.6171875, 'completions/min_length': 2124.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7466.966796875, 'completions/min_terminated_length': 2124.0, 'completions/max_terminated_length': 15442.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017919518053531647, 'sampling/sampling_logp_difference/max': 2.243633508682251, 'sampling/importance_sampling_ratio/min': 0.10607238858938217, 'sampling/importance_sampling_ratio/mean': 0.9999477863311768, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3909869166091084e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 632/1024 [28:04:07<19:02:39, 174.90s/it][AINFO 12-01 23:35:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:35:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:35:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:35:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 633/1024 [28:07:12<19:18:19, 177.75s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014082720736041665, 'learning_rate': 1e-05, 'num_tokens': 560395715.0, 'completions/mean_length': 7356.4609375, 'completions/min_length': 1161.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6912.4833984375, 'completions/min_terminated_length': 1161.0, 'completions/max_terminated_length': 16011.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019914565607905388, 'sampling/sampling_logp_difference/max': 2.57027530670166, 'sampling/importance_sampling_ratio/min': 0.07651448249816895, 'sampling/importance_sampling_ratio/mean': 0.9999120235443115, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.613246205233736e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 633/1024 [28:07:12<19:18:19, 177.75s/it][AINFO 12-01 23:38:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:38:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:38:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:38:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 634/1024 [28:10:00<18:57:11, 174.95s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018412945792078972, 'learning_rate': 1e-05, 'num_tokens': 561419523.0, 'completions/mean_length': 7819.1875, 'completions/min_length': 1327.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7613.63232421875, 'completions/min_terminated_length': 1327.0, 'completions/max_terminated_length': 16316.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3553628921508789, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.0199904702603817, 'sampling/sampling_logp_difference/max': 1.743659257888794, 'sampling/importance_sampling_ratio/min': 0.1748792976140976, 'sampling/importance_sampling_ratio/mean': 1.0000648498535156, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.254311026372307e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 634/1024 [28:10:00<18:57:11, 174.95s/it][AINFO 12-01 23:41:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:41:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:41:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:41:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 635/1024 [28:12:46<18:36:05, 172.15s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0029482271056622267, 'learning_rate': 1e-05, 'num_tokens': 562337583.0, 'completions/mean_length': 6996.84375, 'completions/min_length': 1007.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6694.0322265625, 'completions/min_terminated_length': 1007.0, 'completions/max_terminated_length': 16231.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2880156934261322, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01967364363372326, 'sampling/sampling_logp_difference/max': 2.6062328815460205, 'sampling/importance_sampling_ratio/min': 0.0738120824098587, 'sampling/importance_sampling_ratio/mean': 0.9999892711639404, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6633710212518054e-05, 'epoch': 0.58}
+
+ 62%|██████▏   | 635/1024 [28:12:46<18:36:05, 172.15s/it][AINFO 12-01 23:44:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:44:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:44:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:44:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 636/1024 [28:15:45<18:46:02, 174.13s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0005914675421081483, 'learning_rate': 1e-05, 'num_tokens': 563203522.0, 'completions/mean_length': 6616.6484375, 'completions/min_length': 472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6051.5947265625, 'completions/min_terminated_length': 472.0, 'completions/max_terminated_length': 15942.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.18201877176761627, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01669927127659321, 'sampling/sampling_logp_difference/max': 1.8692678213119507, 'sampling/importance_sampling_ratio/min': 0.1542365550994873, 'sampling/importance_sampling_ratio/mean': 0.9999228715896606, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.759195394399285e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 636/1024 [28:15:45<18:46:02, 174.13s/it][AINFO 12-01 23:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:47:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:47:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▏   | 637/1024 [28:18:41<18:47:24, 174.79s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019180101808160543, 'learning_rate': 1e-05, 'num_tokens': 564265276.0, 'completions/mean_length': 8161.515625, 'completions/min_length': 1680.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7827.26806640625, 'completions/min_terminated_length': 1680.0, 'completions/max_terminated_length': 16271.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.22567614912986755, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020116018131375313, 'sampling/sampling_logp_difference/max': 2.8175625801086426, 'sampling/importance_sampling_ratio/min': 0.05975140631198883, 'sampling/importance_sampling_ratio/mean': 0.9999512434005737, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.140271039432264e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 637/1024 [28:18:41<18:47:24, 174.79s/it][AINFO 12-01 23:49:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:49:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:49:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:49:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 638/1024 [28:21:46<19:05:01, 177.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019858807791024446, 'learning_rate': 1e-05, 'num_tokens': 565435047.0, 'completions/mean_length': 9007.4609375, 'completions/min_length': 986.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8707.6015625, 'completions/min_terminated_length': 986.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.24040167033672333, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020806897431612015, 'sampling/sampling_logp_difference/max': 3.3683948516845703, 'sampling/importance_sampling_ratio/min': 0.03444488346576691, 'sampling/importance_sampling_ratio/mean': 0.9999405145645142, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.2880104501819005e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 638/1024 [28:21:46<19:05:01, 177.98s/it][AINFO 12-01 23:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:53:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 62%|██████▏   | 639/1024 [28:24:43<18:58:37, 177.45s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018036456312984228, 'learning_rate': 1e-05, 'num_tokens': 566413757.0, 'completions/mean_length': 7495.796875, 'completions/min_length': 2085.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7282.48046875, 'completions/min_terminated_length': 2085.0, 'completions/max_terminated_length': 15026.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.22065868973731995, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020383328199386597, 'sampling/sampling_logp_difference/max': 3.548797130584717, 'sampling/importance_sampling_ratio/min': 0.028759213164448738, 'sampling/importance_sampling_ratio/mean': 1.0000522136688232, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5557812338993244e-05, 'epoch': 0.59}
+
+ 62%|██████▏   | 639/1024 [28:24:43<18:58:37, 177.45s/it][AINFO 12-01 23:55:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:55:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:55:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:55:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 62%|██████▎   | 640/1024 [28:27:32<18:39:47, 174.97s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022391905076801777, 'learning_rate': 1e-05, 'num_tokens': 567387744.0, 'completions/mean_length': 7481.9609375, 'completions/min_length': 1185.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7120.08935546875, 'completions/min_terminated_length': 1185.0, 'completions/max_terminated_length': 16247.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0199461467564106, 'sampling/sampling_logp_difference/max': 2.816622734069824, 'sampling/importance_sampling_ratio/min': 0.059807587414979935, 'sampling/importance_sampling_ratio/mean': 0.9999896287918091, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2682909881696105e-05, 'epoch': 0.59}
+
+ 62%|██████▎   | 640/1024 [28:27:32<18:39:47, 174.97s/it][AINFO 12-01 23:58:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:58:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:58:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-01 23:58:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 641/1024 [28:30:24<18:31:16, 174.09s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009802818531170487, 'learning_rate': 1e-05, 'num_tokens': 568312192.0, 'completions/mean_length': 7058.5625, 'completions/min_length': 849.0, 'completions/max_length': 15681.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7058.5625, 'completions/min_terminated_length': 849.0, 'completions/max_terminated_length': 15681.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.21488474309444427, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020550437271595, 'sampling/sampling_logp_difference/max': 1.5561962127685547, 'sampling/importance_sampling_ratio/min': 0.21093690395355225, 'sampling/importance_sampling_ratio/mean': 1.00002121925354, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.1201577055762755e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 641/1024 [28:30:24<18:31:16, 174.09s/it][AINFO 12-02 00:01:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:01:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:01:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:01:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 642/1024 [28:33:09<18:10:39, 171.31s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007734656101092696, 'learning_rate': 1e-05, 'num_tokens': 569254406.0, 'completions/mean_length': 7205.546875, 'completions/min_length': 954.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7059.857421875, 'completions/min_terminated_length': 954.0, 'completions/max_terminated_length': 15735.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.18884867429733276, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021801365539431572, 'sampling/sampling_logp_difference/max': 1.36427640914917, 'sampling/importance_sampling_ratio/min': 0.2757047712802887, 'sampling/importance_sampling_ratio/mean': 0.9999790787696838, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.0715216464850528e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 642/1024 [28:33:09<18:10:39, 171.31s/it][AINFO 12-02 00:04:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:04:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:04:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:04:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 643/1024 [28:36:37<19:18:40, 182.47s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009871537331491709, 'learning_rate': 1e-05, 'num_tokens': 570338494.0, 'completions/mean_length': 8310.875, 'completions/min_length': 703.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7982.69873046875, 'completions/min_terminated_length': 703.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020081112161278725, 'sampling/sampling_logp_difference/max': 3.062197685241699, 'sampling/importance_sampling_ratio/min': 0.04678476229310036, 'sampling/importance_sampling_ratio/mean': 0.9999879002571106, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.15036419099124e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 643/1024 [28:36:37<19:18:40, 182.47s/it][AINFO 12-02 00:07:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:07:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:07:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:07:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 644/1024 [28:39:34<19:05:42, 180.90s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001943186391144991, 'learning_rate': 1e-05, 'num_tokens': 571344899.0, 'completions/mean_length': 7703.6015625, 'completions/min_length': 908.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7565.81787109375, 'completions/min_terminated_length': 908.0, 'completions/max_terminated_length': 15956.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.1354655921459198, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.022189373150467873, 'sampling/sampling_logp_difference/max': 2.5386276245117188, 'sampling/importance_sampling_ratio/min': 0.07897470891475677, 'sampling/importance_sampling_ratio/mean': 1.0000431537628174, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.816606447846425e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 644/1024 [28:39:34<19:05:42, 180.90s/it][AINFO 12-02 00:10:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:10:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:10:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:10:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 645/1024 [28:42:14<18:21:32, 174.39s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020338839385658503, 'learning_rate': 1e-05, 'num_tokens': 572351122.0, 'completions/mean_length': 7705.3671875, 'completions/min_length': 1854.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7425.4111328125, 'completions/min_terminated_length': 1854.0, 'completions/max_terminated_length': 15603.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3182408809661865, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01907338947057724, 'sampling/sampling_logp_difference/max': 1.457681655883789, 'sampling/importance_sampling_ratio/min': 0.23277530074119568, 'sampling/importance_sampling_ratio/mean': 0.9999645948410034, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.678705133504991e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 645/1024 [28:42:14<18:21:32, 174.39s/it][AINFO 12-02 00:13:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:13:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 646/1024 [28:45:04<18:11:13, 173.21s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002281014109030366, 'learning_rate': 1e-05, 'num_tokens': 573328375.0, 'completions/mean_length': 7484.2890625, 'completions/min_length': 911.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7197.201171875, 'completions/min_terminated_length': 911.0, 'completions/max_terminated_length': 15759.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01928749307990074, 'sampling/sampling_logp_difference/max': 9.93208122253418, 'sampling/importance_sampling_ratio/min': 4.8590562073513865e-05, 'sampling/importance_sampling_ratio/mean': 1.0000287294387817, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.664966195377929e-05, 'epoch': 0.59}
+
+ 63%|██████▎   | 646/1024 [28:45:04<18:11:13, 173.21s/it][AINFO 12-02 00:16:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:16:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:16:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:16:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 647/1024 [28:48:11<18:33:58, 177.29s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0033873359207063913, 'learning_rate': 1e-05, 'num_tokens': 574350877.0, 'completions/mean_length': 7808.234375, 'completions/min_length': 707.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7159.71484375, 'completions/min_terminated_length': 707.0, 'completions/max_terminated_length': 16049.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.19097033143043518, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020555756986141205, 'sampling/sampling_logp_difference/max': 6.073037147521973, 'sampling/importance_sampling_ratio/min': 0.00230416445992887, 'sampling/importance_sampling_ratio/mean': 1.0000178813934326, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0048887310840655e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 647/1024 [28:48:11<18:33:58, 177.29s/it][AINFO 12-02 00:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:19:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:19:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 63%|██████▎   | 648/1024 [28:51:35<19:20:46, 185.23s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019415151327848434, 'learning_rate': 1e-05, 'num_tokens': 575494335.0, 'completions/mean_length': 8777.703125, 'completions/min_length': 1679.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 7843.5966796875, 'completions/min_terminated_length': 1679.0, 'completions/max_terminated_length': 15665.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.14123955368995667, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.020342033356428146, 'sampling/sampling_logp_difference/max': 7.654606342315674, 'sampling/importance_sampling_ratio/min': 0.00047385634388774633, 'sampling/importance_sampling_ratio/mean': 0.9999974370002747, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.5816251365995413e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 648/1024 [28:51:35<19:20:46, 185.23s/it][AINFO 12-02 00:22:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:22:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:22:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:22:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 649/1024 [28:54:41<19:19:00, 185.44s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016600601375102997, 'learning_rate': 1e-05, 'num_tokens': 576679758.0, 'completions/mean_length': 9117.2421875, 'completions/min_length': 1652.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 8696.8505859375, 'completions/min_terminated_length': 1652.0, 'completions/max_terminated_length': 16250.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.24541424214839935, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02224336378276348, 'sampling/sampling_logp_difference/max': 3.7605481147766113, 'sampling/importance_sampling_ratio/min': 0.02327098324894905, 'sampling/importance_sampling_ratio/mean': 1.0000677108764648, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.843432040819607e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 649/1024 [28:54:41<19:19:00, 185.44s/it][AINFO 12-02 00:25:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:25:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:25:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:25:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 63%|██████▎   | 650/1024 [28:57:36<18:57:12, 182.44s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015090813394635916, 'learning_rate': 1e-05, 'num_tokens': 577665155.0, 'completions/mean_length': 7547.7890625, 'completions/min_length': 900.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7188.59326171875, 'completions/min_terminated_length': 900.0, 'completions/max_terminated_length': 14768.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2498900145292282, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021169301122426987, 'sampling/sampling_logp_difference/max': 2.609931707382202, 'sampling/importance_sampling_ratio/min': 0.07353956997394562, 'sampling/importance_sampling_ratio/mean': 1.0000258684158325, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.186287048923987e-05, 'epoch': 0.6}
+
+ 63%|██████▎   | 650/1024 [28:57:36<18:57:12, 182.44s/it][AINFO 12-02 00:28:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:28:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:28:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:28:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▎   | 651/1024 [29:00:28<18:35:10, 179.38s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0014943976420909166, 'learning_rate': 1e-05, 'num_tokens': 578732618.0, 'completions/mean_length': 8200.7421875, 'completions/min_length': 1734.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7798.28662109375, 'completions/min_terminated_length': 1734.0, 'completions/max_terminated_length': 15762.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.23068872094154358, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020222939550876617, 'sampling/sampling_logp_difference/max': 1.9532665014266968, 'sampling/importance_sampling_ratio/min': 0.1418100893497467, 'sampling/importance_sampling_ratio/mean': 0.9999547004699707, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.699750070358277e-05, 'epoch': 0.6}
+
+ 64%|██████▎   | 651/1024 [29:00:28<18:35:10, 179.38s/it][AINFO 12-02 00:31:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:31:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:31:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:31:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▎   | 652/1024 [29:03:28<18:32:41, 179.47s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001338844420388341, 'learning_rate': 1e-05, 'num_tokens': 579780393.0, 'completions/mean_length': 8044.9296875, 'completions/min_length': 569.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7912.56396484375, 'completions/min_terminated_length': 569.0, 'completions/max_terminated_length': 15127.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.18253791332244873, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019809681922197342, 'sampling/sampling_logp_difference/max': 4.916159629821777, 'sampling/importance_sampling_ratio/min': 0.0073272162117064, 'sampling/importance_sampling_ratio/mean': 1.0000224113464355, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.862500824425297e-05, 'epoch': 0.6}
+
+ 64%|██████▎   | 652/1024 [29:03:28<18:32:41, 179.47s/it][AINFO 12-02 00:34:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:34:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:34:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:34:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▍   | 653/1024 [29:06:08<17:53:06, 173.55s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017291874391958117, 'learning_rate': 1e-05, 'num_tokens': 580728333.0, 'completions/mean_length': 7255.40625, 'completions/min_length': 1402.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7110.50830078125, 'completions/min_terminated_length': 1402.0, 'completions/max_terminated_length': 16052.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3753383159637451, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.021061915904283524, 'sampling/sampling_logp_difference/max': 2.4991037845611572, 'sampling/importance_sampling_ratio/min': 0.08215859532356262, 'sampling/importance_sampling_ratio/mean': 1.000056266784668, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1950288252555765e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 653/1024 [29:06:08<17:53:06, 173.55s/it][AINFO 12-02 00:37:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:37:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▍   | 654/1024 [29:08:59<17:46:47, 172.99s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016851407708600163, 'learning_rate': 1e-05, 'num_tokens': 581841176.0, 'completions/mean_length': 8521.2109375, 'completions/min_length': 1721.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 8459.298828125, 'completions/min_terminated_length': 1721.0, 'completions/max_terminated_length': 15798.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.23304283618927002, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022069508209824562, 'sampling/sampling_logp_difference/max': 3.704716205596924, 'sampling/importance_sampling_ratio/min': 0.024607202038168907, 'sampling/importance_sampling_ratio/mean': 1.0000420808792114, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6323204287546105e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 654/1024 [29:08:59<17:46:47, 172.99s/it][AINFO 12-02 00:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:40:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▍   | 655/1024 [29:12:04<18:05:35, 176.52s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022014116402715445, 'learning_rate': 1e-05, 'num_tokens': 582907112.0, 'completions/mean_length': 8157.9375, 'completions/min_length': 1141.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7960.51220703125, 'completions/min_terminated_length': 1141.0, 'completions/max_terminated_length': 15914.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.30168038606643677, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020279299467802048, 'sampling/sampling_logp_difference/max': 1.8028547763824463, 'sampling/importance_sampling_ratio/min': 0.17936845123767853, 'sampling/importance_sampling_ratio/mean': 1.0000040531158447, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.457011613747454e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 655/1024 [29:12:04<18:05:35, 176.52s/it][AINFO 12-02 00:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:43:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:43:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 64%|██████▍   | 656/1024 [29:15:20<18:38:03, 182.29s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001535851159133017, 'learning_rate': 1e-05, 'num_tokens': 583880882.0, 'completions/mean_length': 7454.890625, 'completions/min_length': 870.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6938.330078125, 'completions/min_terminated_length': 870.0, 'completions/max_terminated_length': 16062.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.17464719712734222, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020228832960128784, 'sampling/sampling_logp_difference/max': 2.6742470264434814, 'sampling/importance_sampling_ratio/min': 0.06895873695611954, 'sampling/importance_sampling_ratio/mean': 1.0000348091125488, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.079384586930246e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 656/1024 [29:15:20<18:38:03, 182.29s/it][AINFO 12-02 00:46:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:46:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:46:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:46:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▍   | 657/1024 [29:18:08<18:08:53, 178.02s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018130730604752898, 'learning_rate': 1e-05, 'num_tokens': 584799867.0, 'completions/mean_length': 7037.3203125, 'completions/min_length': 902.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6735.814453125, 'completions/min_terminated_length': 902.0, 'completions/max_terminated_length': 15191.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020044857636094093, 'sampling/sampling_logp_difference/max': 4.4225006103515625, 'sampling/importance_sampling_ratio/min': 0.012004177086055279, 'sampling/importance_sampling_ratio/mean': 1.0000591278076172, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.3717617441197945e-05, 'epoch': 0.6}
+
+ 64%|██████▍   | 657/1024 [29:18:08<18:08:53, 178.02s/it][AINFO 12-02 00:49:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:49:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▍   | 658/1024 [29:20:52<17:40:26, 173.84s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001547435880638659, 'learning_rate': 1e-05, 'num_tokens': 585703463.0, 'completions/mean_length': 6909.40625, 'completions/min_length': 1336.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6759.01611328125, 'completions/min_terminated_length': 1336.0, 'completions/max_terminated_length': 16071.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.22225633263587952, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020823702216148376, 'sampling/sampling_logp_difference/max': 2.9022250175476074, 'sampling/importance_sampling_ratio/min': 0.05490092933177948, 'sampling/importance_sampling_ratio/mean': 1.0000979900360107, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.319362005844596e-05, 'epoch': 0.61}
+
+ 64%|██████▍   | 658/1024 [29:20:52<17:40:26, 173.84s/it][AINFO 12-02 00:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:52:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▍   | 659/1024 [29:23:30<17:08:16, 169.03s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0028684926219284534, 'learning_rate': 1e-05, 'num_tokens': 586608077.0, 'completions/mean_length': 6885.421875, 'completions/min_length': 1063.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6734.6513671875, 'completions/min_terminated_length': 1063.0, 'completions/max_terminated_length': 15006.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2382849156856537, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020485304296016693, 'sampling/sampling_logp_difference/max': 10.287874221801758, 'sampling/importance_sampling_ratio/min': 3.404340532142669e-05, 'sampling/importance_sampling_ratio/mean': 0.9999911785125732, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.285892919142498e-05, 'epoch': 0.61}
+
+ 64%|██████▍   | 659/1024 [29:23:30<17:08:16, 169.03s/it][AINFO 12-02 00:54:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:54:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:54:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:54:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 64%|██████▍   | 660/1024 [29:26:38<17:40:29, 174.81s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011183511232957244, 'learning_rate': 1e-05, 'num_tokens': 587716955.0, 'completions/mean_length': 8497.796875, 'completions/min_length': 1707.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7756.359375, 'completions/min_terminated_length': 1707.0, 'completions/max_terminated_length': 15744.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.18648965656757355, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02050340734422207, 'sampling/sampling_logp_difference/max': 6.5563812255859375, 'sampling/importance_sampling_ratio/min': 0.001421018736436963, 'sampling/importance_sampling_ratio/mean': 1.0000625848770142, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.586447891417265e-05, 'epoch': 0.61}
+
+ 64%|██████▍   | 660/1024 [29:26:38<17:40:29, 174.81s/it][AINFO 12-02 00:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 00:57:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 661/1024 [29:29:38<17:46:14, 176.24s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0019862784538418055, 'learning_rate': 1e-05, 'num_tokens': 588768801.0, 'completions/mean_length': 8033.984375, 'completions/min_length': 810.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7248.9404296875, 'completions/min_terminated_length': 810.0, 'completions/max_terminated_length': 15953.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2840767204761505, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019978979602456093, 'sampling/sampling_logp_difference/max': 3.931173801422119, 'sampling/importance_sampling_ratio/min': 0.019620629027485847, 'sampling/importance_sampling_ratio/mean': 1.000004768371582, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3120662187211565e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 661/1024 [29:29:38<17:46:14, 176.24s/it][AINFO 12-02 01:00:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:00:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:00:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:00:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 662/1024 [29:32:22<17:21:49, 172.68s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011825362453237176, 'learning_rate': 1e-05, 'num_tokens': 589741948.0, 'completions/mean_length': 7432.3984375, 'completions/min_length': 1786.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7217.560546875, 'completions/min_terminated_length': 1786.0, 'completions/max_terminated_length': 15960.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.27670514583587646, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019830189645290375, 'sampling/sampling_logp_difference/max': 3.2881827354431152, 'sampling/importance_sampling_ratio/min': 0.03732161223888397, 'sampling/importance_sampling_ratio/mean': 1.0000401735305786, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.751122039830079e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 662/1024 [29:32:22<17:21:49, 172.68s/it][AINFO 12-02 01:03:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:03:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:03:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:03:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 65%|██████▍   | 663/1024 [29:35:39<18:03:35, 180.10s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020902431569993496, 'learning_rate': 1e-05, 'num_tokens': 590792605.0, 'completions/mean_length': 8069.8828125, 'completions/min_length': 973.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7441.08447265625, 'completions/min_terminated_length': 973.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.4200565218925476, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.020122863352298737, 'sampling/sampling_logp_difference/max': 4.315314769744873, 'sampling/importance_sampling_ratio/min': 0.013362343423068523, 'sampling/importance_sampling_ratio/mean': 1.0000402927398682, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.626948379562236e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 663/1024 [29:35:39<18:03:35, 180.10s/it][AINFO 12-02 01:06:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:06:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:06:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:06:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 65%|██████▍   | 664/1024 [29:38:27<17:37:20, 176.22s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012767234584316611, 'learning_rate': 1e-05, 'num_tokens': 591845270.0, 'completions/mean_length': 8079.5703125, 'completions/min_length': 1170.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7525.94189453125, 'completions/min_terminated_length': 1170.0, 'completions/max_terminated_length': 16070.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.32325342297554016, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020189162343740463, 'sampling/sampling_logp_difference/max': 5.255929946899414, 'sampling/importance_sampling_ratio/min': 0.005216493271291256, 'sampling/importance_sampling_ratio/mean': 1.0000768899917603, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.671091776093817e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 664/1024 [29:38:27<17:37:20, 176.22s/it][AINFO 12-02 01:09:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:09:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:09:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:09:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 65%|██████▍   | 665/1024 [29:41:29<17:46:09, 178.19s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012426557950675488, 'learning_rate': 1e-05, 'num_tokens': 592934289.0, 'completions/mean_length': 8355.0859375, 'completions/min_length': 1161.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8028.70703125, 'completions/min_terminated_length': 1161.0, 'completions/max_terminated_length': 16146.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020710071548819542, 'sampling/sampling_logp_difference/max': 10.880023956298828, 'sampling/importance_sampling_ratio/min': 1.8830663975677453e-05, 'sampling/importance_sampling_ratio/mean': 0.9999717473983765, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.110763274591591e-05, 'epoch': 0.61}
+
+ 65%|██████▍   | 665/1024 [29:41:29<17:46:09, 178.19s/it][AINFO 12-02 01:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:12:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:12:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 65%|██████▌   | 666/1024 [29:44:34<17:54:24, 180.07s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002299762796610594, 'learning_rate': 1e-05, 'num_tokens': 594084200.0, 'completions/mean_length': 8827.9921875, 'completions/min_length': 825.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8456.384765625, 'completions/min_terminated_length': 825.0, 'completions/max_terminated_length': 16159.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3261364698410034, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01951008290052414, 'sampling/sampling_logp_difference/max': 1.993185043334961, 'sampling/importance_sampling_ratio/min': 0.1362607330083847, 'sampling/importance_sampling_ratio/mean': 0.9999823570251465, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.733991090688505e-05, 'epoch': 0.61}
+
+ 65%|██████▌   | 666/1024 [29:44:34<17:54:24, 180.07s/it][AINFO 12-02 01:15:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:15:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:15:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:15:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 65%|██████▌   | 667/1024 [29:47:27<17:38:58, 177.98s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012675058096647263, 'learning_rate': 1e-05, 'num_tokens': 595061199.0, 'completions/mean_length': 7478.6171875, 'completions/min_length': 1448.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7264.88818359375, 'completions/min_terminated_length': 1448.0, 'completions/max_terminated_length': 15922.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.21146979928016663, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019648097455501556, 'sampling/sampling_logp_difference/max': 2.6123342514038086, 'sampling/importance_sampling_ratio/min': 0.07336309552192688, 'sampling/importance_sampling_ratio/mean': 1.0000317096710205, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0316146535369626e-05, 'epoch': 0.61}
+
+ 65%|██████▌   | 667/1024 [29:47:27<17:38:58, 177.98s/it][AINFO 12-02 01:18:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:18:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:18:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:18:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 65%|██████▌   | 668/1024 [29:50:40<18:02:33, 182.45s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009600589983165264, 'learning_rate': 1e-05, 'num_tokens': 596076232.0, 'completions/mean_length': 7763.6328125, 'completions/min_length': 948.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7264.93359375, 'completions/min_terminated_length': 948.0, 'completions/max_terminated_length': 15948.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.34063735604286194, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017017195001244545, 'sampling/sampling_logp_difference/max': 8.537160873413086, 'sampling/importance_sampling_ratio/min': 0.00019604606495704502, 'sampling/importance_sampling_ratio/mean': 0.9999954700469971, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.2480785092921e-05, 'epoch': 0.61}
+
+ 65%|██████▌   | 668/1024 [29:50:40<18:02:33, 182.45s/it][AINFO 12-02 01:21:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:21:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 65%|██████▌   | 669/1024 [29:53:51<18:15:27, 185.15s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011911778710782528, 'learning_rate': 1e-05, 'num_tokens': 597149131.0, 'completions/mean_length': 8223.9609375, 'completions/min_length': 858.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7822.64697265625, 'completions/min_terminated_length': 858.0, 'completions/max_terminated_length': 15998.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2788218855857849, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018567483872175217, 'sampling/sampling_logp_difference/max': 2.8308932781219482, 'sampling/importance_sampling_ratio/min': 0.05896016210317612, 'sampling/importance_sampling_ratio/mean': 1.0000197887420654, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.315408631962782e-05, 'epoch': 0.62}
+
+ 65%|██████▌   | 669/1024 [29:53:51<18:15:27, 185.15s/it][AINFO 12-02 01:25:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:25:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:25:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:25:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 65%|██████▌   | 670/1024 [29:56:30<17:25:17, 177.17s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019217046210542321, 'learning_rate': 1e-05, 'num_tokens': 598017240.0, 'completions/mean_length': 6639.4765625, 'completions/min_length': 993.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6405.6083984375, 'completions/min_terminated_length': 993.0, 'completions/max_terminated_length': 14667.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2937847673892975, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01841411553323269, 'sampling/sampling_logp_difference/max': 1.8561670780181885, 'sampling/importance_sampling_ratio/min': 0.15627045929431915, 'sampling/importance_sampling_ratio/mean': 1.0000083446502686, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.721546883956762e-05, 'epoch': 0.62}
+
+ 65%|██████▌   | 670/1024 [29:56:30<17:25:17, 177.17s/it][AINFO 12-02 01:27:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:27:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:27:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:27:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 671/1024 [29:59:17<17:04:11, 174.08s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010638483799993992, 'learning_rate': 1e-05, 'num_tokens': 598925576.0, 'completions/mean_length': 6937.75, 'completions/min_length': 1218.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6711.04052734375, 'completions/min_terminated_length': 1218.0, 'completions/max_terminated_length': 16107.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.23857943713665009, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019752617925405502, 'sampling/sampling_logp_difference/max': 3.0034961700439453, 'sampling/importance_sampling_ratio/min': 0.049613308161497116, 'sampling/importance_sampling_ratio/mean': 1.0000689029693604, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2278860721344245e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 671/1024 [29:59:17<17:04:11, 174.08s/it][AINFO 12-02 01:30:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:30:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:30:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:30:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 66%|██████▌   | 672/1024 [30:02:14<17:06:18, 174.94s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010879082838073373, 'learning_rate': 1e-05, 'num_tokens': 599903752.0, 'completions/mean_length': 7486.3125, 'completions/min_length': 1072.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6971.56982421875, 'completions/min_terminated_length': 1072.0, 'completions/max_terminated_length': 16238.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.32195523381233215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019042087718844414, 'sampling/sampling_logp_difference/max': 3.769152879714966, 'sampling/importance_sampling_ratio/min': 0.02307160012423992, 'sampling/importance_sampling_ratio/mean': 0.9999791383743286, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.333740059792035e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 672/1024 [30:02:14<17:06:18, 174.94s/it][AINFO 12-02 01:33:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:33:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:33:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:33:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 66%|██████▌   | 673/1024 [30:05:33<17:45:17, 182.10s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0016081302892416716, 'learning_rate': 1e-05, 'num_tokens': 601109759.0, 'completions/mean_length': 9270.6171875, 'completions/min_length': 1622.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 8466.4951171875, 'completions/min_terminated_length': 1622.0, 'completions/max_terminated_length': 16269.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.22567617893218994, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01906023919582367, 'sampling/sampling_logp_difference/max': 3.3350718021392822, 'sampling/importance_sampling_ratio/min': 0.03561202809214592, 'sampling/importance_sampling_ratio/mean': 1.0000009536743164, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.45755602211284e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 673/1024 [30:05:33<17:45:17, 182.10s/it][AINFO 12-02 01:36:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:36:49 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 674/1024 [30:08:28<17:31:10, 180.20s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0006001724395900965, 'learning_rate': 1e-05, 'num_tokens': 602146904.0, 'completions/mean_length': 7964.0078125, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7476.900390625, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15726.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2359209954738617, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017905468121170998, 'sampling/sampling_logp_difference/max': 2.1254849433898926, 'sampling/importance_sampling_ratio/min': 0.1193750649690628, 'sampling/importance_sampling_ratio/mean': 1.0000492334365845, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.0782626001600875e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 674/1024 [30:08:28<17:31:10, 180.20s/it][AINFO 12-02 01:39:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:39:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:39:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:39:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 66%|██████▌   | 675/1024 [30:11:31<17:33:20, 181.09s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013308044290170074, 'learning_rate': 1e-05, 'num_tokens': 603114542.0, 'completions/mean_length': 7436.734375, 'completions/min_length': 762.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6760.05078125, 'completions/min_terminated_length': 762.0, 'completions/max_terminated_length': 16116.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2238539308309555, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01774020865559578, 'sampling/sampling_logp_difference/max': 9.993271827697754, 'sampling/importance_sampling_ratio/min': 4.570641976897605e-05, 'sampling/importance_sampling_ratio/mean': 1.0000581741333008, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.838838544354076e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 675/1024 [30:11:31<17:33:20, 181.09s/it][AINFO 12-02 01:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:42:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 676/1024 [30:14:30<17:26:00, 180.35s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001520191435702145, 'learning_rate': 1e-05, 'num_tokens': 604135191.0, 'completions/mean_length': 7814.3828125, 'completions/min_length': 1544.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7318.61962890625, 'completions/min_terminated_length': 1544.0, 'completions/max_terminated_length': 15812.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2212003767490387, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018672889098525047, 'sampling/sampling_logp_difference/max': 10.216349601745605, 'sampling/importance_sampling_ratio/min': 3.6567540519172326e-05, 'sampling/importance_sampling_ratio/mean': 1.0000321865081787, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.249965872484609e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 676/1024 [30:14:30<17:26:00, 180.35s/it][AINFO 12-02 01:45:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:45:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:45:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:45:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▌   | 677/1024 [30:17:21<17:06:09, 177.43s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003033524611964822, 'learning_rate': 1e-05, 'num_tokens': 605101508.0, 'completions/mean_length': 7376.2890625, 'completions/min_length': 929.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7085.7177734375, 'completions/min_terminated_length': 929.0, 'completions/max_terminated_length': 15891.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.33510076999664307, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020018117502331734, 'sampling/sampling_logp_difference/max': 1.7408509254455566, 'sampling/importance_sampling_ratio/min': 0.17537111043930054, 'sampling/importance_sampling_ratio/mean': 1.0001227855682373, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.985021430708002e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 677/1024 [30:17:21<17:06:09, 177.43s/it][AINFO 12-02 01:48:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:48:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:48:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:48:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 66%|██████▌   | 678/1024 [30:20:10<16:49:07, 174.99s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001867604092694819, 'learning_rate': 1e-05, 'num_tokens': 605900464.0, 'completions/mean_length': 6084.15625, 'completions/min_length': 858.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5751.90283203125, 'completions/min_terminated_length': 858.0, 'completions/max_terminated_length': 16339.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.23068872094154358, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0177907757461071, 'sampling/sampling_logp_difference/max': 4.193622589111328, 'sampling/importance_sampling_ratio/min': 0.015091515146195889, 'sampling/importance_sampling_ratio/mean': 0.999966025352478, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.559530000529776e-05, 'epoch': 0.62}
+
+ 66%|██████▌   | 678/1024 [30:20:10<16:49:07, 174.99s/it][AINFO 12-02 01:51:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:51:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:51:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:51:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 66%|██████▋   | 679/1024 [30:23:20<17:12:30, 179.57s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007769939838908613, 'learning_rate': 1e-05, 'num_tokens': 607100031.0, 'completions/mean_length': 9227.5546875, 'completions/min_length': 1852.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 8418.564453125, 'completions/min_terminated_length': 1852.0, 'completions/max_terminated_length': 15968.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.16675157845020294, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019650045782327652, 'sampling/sampling_logp_difference/max': 2.9031193256378174, 'sampling/importance_sampling_ratio/min': 0.05485185235738754, 'sampling/importance_sampling_ratio/mean': 1.000018835067749, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.393280178443092e-05, 'epoch': 0.62}
+
+ 66%|██████▋   | 679/1024 [30:23:20<17:12:30, 179.57s/it][AINFO 12-02 01:54:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:54:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:54:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:54:37 [block_pool.py:292] Successfully reset prefix cache
+
+ 66%|██████▋   | 680/1024 [30:26:21<17:10:43, 179.78s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019156413618475199, 'learning_rate': 1e-05, 'num_tokens': 608134999.0, 'completions/mean_length': 7930.25, 'completions/min_length': 1166.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7586.6015625, 'completions/min_terminated_length': 1166.0, 'completions/max_terminated_length': 16338.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.28353503346443176, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021187983453273773, 'sampling/sampling_logp_difference/max': 1.545616626739502, 'sampling/importance_sampling_ratio/min': 0.21318039298057556, 'sampling/importance_sampling_ratio/mean': 1.0000245571136475, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.556123201382434e-05, 'epoch': 0.63}
+
+ 66%|██████▋   | 680/1024 [30:26:21<17:10:43, 179.78s/it][AINFO 12-02 01:57:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:57:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:57:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 01:57:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 67%|██████▋   | 681/1024 [30:29:29<17:21:52, 182.25s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001825473504140973, 'learning_rate': 1e-05, 'num_tokens': 609312914.0, 'completions/mean_length': 9026.9609375, 'completions/min_length': 1516.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8665.138671875, 'completions/min_terminated_length': 1516.0, 'completions/max_terminated_length': 16083.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020096033811569214, 'sampling/sampling_logp_difference/max': 2.9243674278259277, 'sampling/importance_sampling_ratio/min': 0.05369865149259567, 'sampling/importance_sampling_ratio/mean': 1.0000073909759521, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.520854119822616e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 681/1024 [30:29:29<17:21:52, 182.25s/it][AINFO 12-02 02:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:00:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 67%|██████▋   | 682/1024 [30:32:30<17:16:44, 181.89s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0016420227475464344, 'learning_rate': 1e-05, 'num_tokens': 610533958.0, 'completions/mean_length': 9398.59375, 'completions/min_length': 2084.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 8741.8466796875, 'completions/min_terminated_length': 2084.0, 'completions/max_terminated_length': 16127.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.27540695667266846, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01912839338183403, 'sampling/sampling_logp_difference/max': 2.357304573059082, 'sampling/importance_sampling_ratio/min': 0.09467507153749466, 'sampling/importance_sampling_ratio/mean': 1.0000451803207397, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.422855656433967e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 682/1024 [30:32:30<17:16:44, 181.89s/it][AINFO 12-02 02:03:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:03:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 683/1024 [30:35:02<16:23:25, 173.04s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018911457154899836, 'learning_rate': 1e-05, 'num_tokens': 611336965.0, 'completions/mean_length': 6124.8671875, 'completions/min_length': 1355.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5962.02392578125, 'completions/min_terminated_length': 1355.0, 'completions/max_terminated_length': 15225.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2517249882221222, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018583890050649643, 'sampling/sampling_logp_difference/max': 1.9649643898010254, 'sampling/importance_sampling_ratio/min': 0.1663089394569397, 'sampling/importance_sampling_ratio/mean': 1.000014066696167, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1790173352419515e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 683/1024 [30:35:02<16:23:25, 173.04s/it][AINFO 12-02 02:06:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:06:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 67%|██████▋   | 684/1024 [30:37:51<16:13:28, 171.79s/it][A
+                                                         [A{'loss': -0.0001, 'grad_norm': 0.002155512571334839, 'learning_rate': 1e-05, 'num_tokens': 612302994.0, 'completions/mean_length': 7395.6015625, 'completions/min_length': 1318.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7179.88037109375, 'completions/min_terminated_length': 1318.0, 'completions/max_terminated_length': 15490.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.36691081523895264, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020452633500099182, 'sampling/sampling_logp_difference/max': 2.083150863647461, 'sampling/importance_sampling_ratio/min': 0.13337723910808563, 'sampling/importance_sampling_ratio/mean': 0.9999696016311646, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.392584236891707e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 684/1024 [30:37:51<16:13:28, 171.79s/it][AINFO 12-02 02:09:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:09:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:09:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:09:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 67%|██████▋   | 685/1024 [30:40:34<15:56:40, 169.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015657866606488824, 'learning_rate': 1e-05, 'num_tokens': 613281160.0, 'completions/mean_length': 7513.796875, 'completions/min_length': 1341.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7153.21923828125, 'completions/min_terminated_length': 1341.0, 'completions/max_terminated_length': 16349.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3527093529701233, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019633764401078224, 'sampling/sampling_logp_difference/max': 3.7076854705810547, 'sampling/importance_sampling_ratio/min': 0.02453424222767353, 'sampling/importance_sampling_ratio/mean': 1.0000494718551636, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.593643522341154e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 685/1024 [30:40:34<15:56:40, 169.32s/it][AINFO 12-02 02:11:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:11:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 67%|██████▋   | 686/1024 [30:43:19<15:46:36, 168.04s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0015854707453399897, 'learning_rate': 1e-05, 'num_tokens': 614196112.0, 'completions/mean_length': 7018.625, 'completions/min_length': 644.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6476.826171875, 'completions/min_terminated_length': 644.0, 'completions/max_terminated_length': 16064.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2477683573961258, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01705239713191986, 'sampling/sampling_logp_difference/max': 2.7290499210357666, 'sampling/importance_sampling_ratio/min': 0.06528128683567047, 'sampling/importance_sampling_ratio/mean': 1.0000513792037964, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.8048506414488656e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 686/1024 [30:43:19<15:46:36, 168.04s/it][AINFO 12-02 02:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:14:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 687/1024 [30:46:15<15:56:56, 170.38s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018423815490677953, 'learning_rate': 1e-05, 'num_tokens': 615189455.0, 'completions/mean_length': 7600.1796875, 'completions/min_length': 1231.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7316.83056640625, 'completions/min_terminated_length': 1231.0, 'completions/max_terminated_length': 15886.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.33402228355407715, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01919039711356163, 'sampling/sampling_logp_difference/max': 7.622128009796143, 'sampling/importance_sampling_ratio/min': 0.0004894990706816316, 'sampling/importance_sampling_ratio/mean': 0.9999613761901855, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.763172364619095e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 687/1024 [30:46:15<15:56:56, 170.38s/it][AINFO 12-02 02:17:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:17:32 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 688/1024 [30:48:53<15:33:17, 166.66s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022319757845252752, 'learning_rate': 1e-05, 'num_tokens': 616115638.0, 'completions/mean_length': 7087.9921875, 'completions/min_length': 1088.0, 'completions/max_length': 15845.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7087.9921875, 'completions/min_terminated_length': 1088.0, 'completions/max_terminated_length': 15845.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3543020486831665, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01995745301246643, 'sampling/sampling_logp_difference/max': 3.076416254043579, 'sampling/importance_sampling_ratio/min': 0.046124257147312164, 'sampling/importance_sampling_ratio/mean': 0.9999686479568481, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.0350538205966586e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 688/1024 [30:48:53<15:33:17, 166.66s/it][AINFO 12-02 02:20:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:20:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:20:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:20:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 67%|██████▋   | 689/1024 [30:52:02<16:07:26, 173.27s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0011952260974794626, 'learning_rate': 1e-05, 'num_tokens': 617261056.0, 'completions/mean_length': 8789.203125, 'completions/min_length': 1460.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8282.8837890625, 'completions/min_terminated_length': 1460.0, 'completions/max_terminated_length': 15655.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.28801077604293823, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02010430209338665, 'sampling/sampling_logp_difference/max': 1.8787459135055542, 'sampling/importance_sampling_ratio/min': 0.15278159081935883, 'sampling/importance_sampling_ratio/mean': 0.9999961256980896, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.658339705405524e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 689/1024 [30:52:02<16:07:26, 173.27s/it][AINFO 12-02 02:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:23:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 67%|██████▋   | 690/1024 [30:55:01<16:14:57, 175.14s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001034099142998457, 'learning_rate': 1e-05, 'num_tokens': 618333060.0, 'completions/mean_length': 8213.21875, 'completions/min_length': 1174.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7595.2607421875, 'completions/min_terminated_length': 1174.0, 'completions/max_terminated_length': 16328.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019167877733707428, 'sampling/sampling_logp_difference/max': 5.747104167938232, 'sampling/importance_sampling_ratio/min': 0.0031920108012855053, 'sampling/importance_sampling_ratio/mean': 0.9999808073043823, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.9363259020319674e-05, 'epoch': 0.63}
+
+ 67%|██████▋   | 690/1024 [30:55:01<16:14:57, 175.14s/it][AINFO 12-02 02:26:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:26:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:26:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:26:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 67%|██████▋   | 691/1024 [30:58:06<16:27:04, 177.85s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0035765625070780516, 'learning_rate': 1e-05, 'num_tokens': 619368977.0, 'completions/mean_length': 7931.3515625, 'completions/min_length': 851.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7728.48828125, 'completions/min_terminated_length': 851.0, 'completions/max_terminated_length': 16067.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020664311945438385, 'sampling/sampling_logp_difference/max': 5.807402610778809, 'sampling/importance_sampling_ratio/min': 0.0030052256770431995, 'sampling/importance_sampling_ratio/mean': 1.0000253915786743, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.5601640838176536e-05, 'epoch': 0.64}
+
+ 67%|██████▋   | 691/1024 [30:58:06<16:27:04, 177.85s/it][AINFO 12-02 02:29:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:29:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:29:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:29:22 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 692/1024 [31:00:44<15:51:24, 171.94s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0014891134342178702, 'learning_rate': 1e-05, 'num_tokens': 620193631.0, 'completions/mean_length': 6304.984375, 'completions/min_length': 710.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6063.08837890625, 'completions/min_terminated_length': 710.0, 'completions/max_terminated_length': 15955.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.21040897071361542, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019624091684818268, 'sampling/sampling_logp_difference/max': 4.340904712677002, 'sampling/importance_sampling_ratio/min': 0.013024738989770412, 'sampling/importance_sampling_ratio/mean': 0.9999635219573975, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.8633566898861318e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 692/1024 [31:00:44<15:51:24, 171.94s/it][AINFO 12-02 02:32:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:32:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:32:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:32:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 693/1024 [31:03:26<15:32:01, 168.95s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002390529727563262, 'learning_rate': 1e-05, 'num_tokens': 621149436.0, 'completions/mean_length': 7254.9765625, 'completions/min_length': 730.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7110.07177734375, 'completions/min_terminated_length': 730.0, 'completions/max_terminated_length': 15276.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2688046097755432, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021049557253718376, 'sampling/sampling_logp_difference/max': 2.1839022636413574, 'sampling/importance_sampling_ratio/min': 0.11260127276182175, 'sampling/importance_sampling_ratio/mean': 0.9999436140060425, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7963828788178944e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 693/1024 [31:03:26<15:32:01, 168.95s/it][AINFO 12-02 02:34:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:34:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 694/1024 [31:06:03<15:09:15, 165.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017312433337792754, 'learning_rate': 1e-05, 'num_tokens': 621970854.0, 'completions/mean_length': 6258.140625, 'completions/min_length': 1043.0, 'completions/max_length': 16072.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6258.140625, 'completions/min_terminated_length': 1043.0, 'completions/max_terminated_length': 16072.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.640625, 'reward_std': 0.2419992983341217, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018020927906036377, 'sampling/sampling_logp_difference/max': 14.749898910522461, 'sampling/importance_sampling_ratio/min': 3.9282605257540126e-07, 'sampling/importance_sampling_ratio/mean': 0.9999521374702454, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.523629382267245e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 694/1024 [31:06:03<15:09:15, 165.32s/it][AINFO 12-02 02:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:37:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 695/1024 [31:08:56<15:19:36, 167.71s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011407635174691677, 'learning_rate': 1e-05, 'num_tokens': 622986612.0, 'completions/mean_length': 7797.296875, 'completions/min_length': 1147.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7374.99951171875, 'completions/min_terminated_length': 1147.0, 'completions/max_terminated_length': 15208.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.17282496392726898, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0201115645468235, 'sampling/sampling_logp_difference/max': 4.877412796020508, 'sampling/importance_sampling_ratio/min': 0.007616694550961256, 'sampling/importance_sampling_ratio/mean': 1.0000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.9848634262161795e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 695/1024 [31:08:56<15:19:36, 167.71s/it][AINFO 12-02 02:40:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:40:13 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 696/1024 [31:11:45<15:19:10, 168.14s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.004195662681013346, 'learning_rate': 1e-05, 'num_tokens': 623856702.0, 'completions/mean_length': 6612.078125, 'completions/min_length': 814.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6377.55224609375, 'completions/min_terminated_length': 814.0, 'completions/max_terminated_length': 16333.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32089442014694214, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018910642713308334, 'sampling/sampling_logp_difference/max': 3.5543971061706543, 'sampling/importance_sampling_ratio/min': 0.028598612174391747, 'sampling/importance_sampling_ratio/mean': 0.9999648928642273, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.619588248213404e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 696/1024 [31:11:45<15:19:10, 168.14s/it][AINFO 12-02 02:43:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:43:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 697/1024 [31:14:34<15:17:14, 168.30s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014166319742798805, 'learning_rate': 1e-05, 'num_tokens': 624691458.0, 'completions/mean_length': 6374.09375, 'completions/min_length': 1414.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6133.8564453125, 'completions/min_terminated_length': 1414.0, 'completions/max_terminated_length': 15310.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.21040897071361542, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018751274794340134, 'sampling/sampling_logp_difference/max': 2.1256957054138184, 'sampling/importance_sampling_ratio/min': 0.119349904358387, 'sampling/importance_sampling_ratio/mean': 1.0000518560409546, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.04452064231009e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 697/1024 [31:14:34<15:17:14, 168.30s/it][AINFO 12-02 02:45:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:45:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:45:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:45:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 698/1024 [31:17:27<15:22:32, 169.79s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0026315958239138126, 'learning_rate': 1e-05, 'num_tokens': 625685194.0, 'completions/mean_length': 7620.75, 'completions/min_length': 1584.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7264.52001953125, 'completions/min_terminated_length': 1584.0, 'completions/max_terminated_length': 16335.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.25012245774269104, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020508352667093277, 'sampling/sampling_logp_difference/max': 2.386033058166504, 'sampling/importance_sampling_ratio/min': 0.09199389815330505, 'sampling/importance_sampling_ratio/mean': 0.9999381899833679, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.3093026956976246e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 698/1024 [31:17:27<15:22:32, 169.79s/it][AINFO 12-02 02:48:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:48:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:48:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:48:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 699/1024 [31:20:18<15:21:05, 170.05s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015654662856832147, 'learning_rate': 1e-05, 'num_tokens': 626636478.0, 'completions/mean_length': 7288.78125, 'completions/min_length': 1305.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6762.611328125, 'completions/min_terminated_length': 1305.0, 'completions/max_terminated_length': 16090.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018975891172885895, 'sampling/sampling_logp_difference/max': 1.858222484588623, 'sampling/importance_sampling_ratio/min': 0.15594959259033203, 'sampling/importance_sampling_ratio/mean': 0.9999785423278809, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.289209179361933e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 699/1024 [31:20:18<15:21:05, 170.05s/it][AINFO 12-02 02:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:51:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 68%|██████▊   | 700/1024 [31:23:12<15:24:51, 171.27s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0017488511512055993, 'learning_rate': 1e-05, 'num_tokens': 627703278.0, 'completions/mean_length': 8185.875, 'completions/min_length': 714.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7639.33349609375, 'completions/min_terminated_length': 714.0, 'completions/max_terminated_length': 15871.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.28407180309295654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020175417885184288, 'sampling/sampling_logp_difference/max': 2.372049331665039, 'sampling/importance_sampling_ratio/min': 0.09328935295343399, 'sampling/importance_sampling_ratio/mean': 0.9999751448631287, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.69748144193727e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 700/1024 [31:23:12<15:24:51, 171.27s/it][AINFO 12-02 02:54:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:54:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:54:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:54:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 68%|██████▊   | 701/1024 [31:26:14<15:40:25, 174.69s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0007408512174151838, 'learning_rate': 1e-05, 'num_tokens': 628771362.0, 'completions/mean_length': 8191.15625, 'completions/min_length': 825.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7644.96728515625, 'completions/min_terminated_length': 825.0, 'completions/max_terminated_length': 15425.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.14913025498390198, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02076815254986286, 'sampling/sampling_logp_difference/max': 1.7933764457702637, 'sampling/importance_sampling_ratio/min': 0.16639739274978638, 'sampling/importance_sampling_ratio/mean': 0.9999026656150818, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.11413399142657e-05, 'epoch': 0.64}
+
+ 68%|██████▊   | 701/1024 [31:26:14<15:40:25, 174.69s/it][AINFO 12-02 02:57:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:57:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:57:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 02:57:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▊   | 702/1024 [31:29:01<15:24:37, 172.29s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020427817944437265, 'learning_rate': 1e-05, 'num_tokens': 629659404.0, 'completions/mean_length': 6771.390625, 'completions/min_length': 1211.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6618.81005859375, 'completions/min_terminated_length': 1211.0, 'completions/max_terminated_length': 15805.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.30221715569496155, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018300345167517662, 'sampling/sampling_logp_difference/max': 2.59881329536438, 'sampling/importance_sampling_ratio/min': 0.07436177134513855, 'sampling/importance_sampling_ratio/mean': 1.000025749206543, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.62914268584791e-05, 'epoch': 0.65}
+
+ 69%|██████▊   | 702/1024 [31:29:01<15:24:37, 172.29s/it][AINFO 12-02 03:00:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:00:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:00:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:00:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▊   | 703/1024 [31:32:01<15:34:34, 174.69s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.005669564474374056, 'learning_rate': 1e-05, 'num_tokens': 630562137.0, 'completions/mean_length': 6874.9140625, 'completions/min_length': 844.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6407.25390625, 'completions/min_terminated_length': 844.0, 'completions/max_terminated_length': 15702.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01879579946398735, 'sampling/sampling_logp_difference/max': 6.486120223999023, 'sampling/importance_sampling_ratio/min': 0.0015244520036503673, 'sampling/importance_sampling_ratio/mean': 0.9999817609786987, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.982662406087911e-05, 'epoch': 0.65}
+
+ 69%|██████▊   | 703/1024 [31:32:01<15:34:34, 174.69s/it][AINFO 12-02 03:03:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:03:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:03:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:03:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 704/1024 [31:34:44<15:12:38, 171.12s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.000894398894160986, 'learning_rate': 1e-05, 'num_tokens': 631473239.0, 'completions/mean_length': 6967.421875, 'completions/min_length': 794.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6817.95263671875, 'completions/min_terminated_length': 794.0, 'completions/max_terminated_length': 15922.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.18542881309986115, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01982988975942135, 'sampling/sampling_logp_difference/max': 6.328139781951904, 'sampling/importance_sampling_ratio/min': 0.0017853517783805728, 'sampling/importance_sampling_ratio/mean': 0.9999149441719055, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.1905532889832102e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 704/1024 [31:34:44<15:12:38, 171.12s/it][AINFO 12-02 03:06:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:06:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:06:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:06:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 705/1024 [31:37:22<14:48:25, 167.10s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021880087442696095, 'learning_rate': 1e-05, 'num_tokens': 632372214.0, 'completions/mean_length': 6852.8671875, 'completions/min_length': 1042.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6545.4111328125, 'completions/min_terminated_length': 1042.0, 'completions/max_terminated_length': 16352.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2380426526069641, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019545096904039383, 'sampling/sampling_logp_difference/max': 8.168079376220703, 'sampling/importance_sampling_ratio/min': 0.0002835621125996113, 'sampling/importance_sampling_ratio/mean': 1.0000286102294922, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6745424495165935e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 705/1024 [31:37:22<14:48:25, 167.10s/it][AINFO 12-02 03:08:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:08:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 706/1024 [31:39:48<14:11:30, 160.66s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0016789337387308478, 'learning_rate': 1e-05, 'num_tokens': 633312546.0, 'completions/mean_length': 7180.84375, 'completions/min_length': 887.0, 'completions/max_length': 15060.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7180.84375, 'completions/min_terminated_length': 887.0, 'completions/max_terminated_length': 15060.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.17358636856079102, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02291160076856613, 'sampling/sampling_logp_difference/max': 10.056546211242676, 'sampling/importance_sampling_ratio/min': 4.2903968278551474e-05, 'sampling/importance_sampling_ratio/mean': 0.9998760223388672, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.4689652459765057e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 706/1024 [31:39:48<14:11:30, 160.66s/it][AINFO 12-02 03:11:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:11:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:11:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:11:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 69%|██████▉   | 707/1024 [31:42:33<14:15:54, 162.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018599401228129864, 'learning_rate': 1e-05, 'num_tokens': 634310737.0, 'completions/mean_length': 7622.4921875, 'completions/min_length': 1080.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7412.21630859375, 'completions/min_terminated_length': 1080.0, 'completions/max_terminated_length': 15810.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.17176413536071777, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022728879004716873, 'sampling/sampling_logp_difference/max': 5.2594099044799805, 'sampling/importance_sampling_ratio/min': 0.005198371596634388, 'sampling/importance_sampling_ratio/mean': 1.0000407695770264, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5338669565353484e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 707/1024 [31:42:33<14:15:54, 162.00s/it][AINFO 12-02 03:13:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:13:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:13:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:13:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 708/1024 [31:45:48<15:06:17, 172.08s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018327049911022186, 'learning_rate': 1e-05, 'num_tokens': 635315674.0, 'completions/mean_length': 7699.1328125, 'completions/min_length': 1221.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7042.29443359375, 'completions/min_terminated_length': 1221.0, 'completions/max_terminated_length': 16106.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.27722424268722534, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01952367275953293, 'sampling/sampling_logp_difference/max': 2.3502726554870605, 'sampling/importance_sampling_ratio/min': 0.09534315764904022, 'sampling/importance_sampling_ratio/mean': 0.9999791383743286, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1668479980216944e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 708/1024 [31:45:48<15:06:17, 172.08s/it][AINFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:17:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 709/1024 [31:48:37<14:58:45, 171.19s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017338332254439592, 'learning_rate': 1e-05, 'num_tokens': 636323012.0, 'completions/mean_length': 7696.203125, 'completions/min_length': 1709.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7558.30224609375, 'completions/min_terminated_length': 1709.0, 'completions/max_terminated_length': 15984.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2517200708389282, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021855801343917847, 'sampling/sampling_logp_difference/max': 3.77878999710083, 'sampling/importance_sampling_ratio/min': 0.02285032533109188, 'sampling/importance_sampling_ratio/mean': 1.0000792741775513, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.31190760032041e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 709/1024 [31:48:37<14:58:45, 171.19s/it][AINFO 12-02 03:19:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:19:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 710/1024 [31:51:55<15:37:38, 179.17s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0029061399400234222, 'learning_rate': 1e-05, 'num_tokens': 637380330.0, 'completions/mean_length': 8118.921875, 'completions/min_length': 1077.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7712.4423828125, 'completions/min_terminated_length': 1077.0, 'completions/max_terminated_length': 15943.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3037971258163452, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018279653042554855, 'sampling/sampling_logp_difference/max': 3.7112560272216797, 'sampling/importance_sampling_ratio/min': 0.024446796625852585, 'sampling/importance_sampling_ratio/mean': 1.0000308752059937, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.814407543562993e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 710/1024 [31:51:55<15:37:38, 179.17s/it][AINFO 12-02 03:23:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:23:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:23:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:23:12 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 69%|██████▉   | 711/1024 [31:54:26<14:50:39, 170.73s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.003259100718423724, 'learning_rate': 1e-05, 'num_tokens': 638227170.0, 'completions/mean_length': 6462.625, 'completions/min_length': 1014.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6142.58056640625, 'completions/min_terminated_length': 1014.0, 'completions/max_terminated_length': 15116.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.21937817335128784, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02055167965590954, 'sampling/sampling_logp_difference/max': 1.3961727619171143, 'sampling/importance_sampling_ratio/min': 0.2475425750017166, 'sampling/importance_sampling_ratio/mean': 0.9999736547470093, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.779211571483756e-05, 'epoch': 0.65}
+
+ 69%|██████▉   | 711/1024 [31:54:26<14:50:39, 170.73s/it][AINFO 12-02 03:25:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:25:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:25:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:25:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|██████▉   | 712/1024 [31:57:04<14:27:26, 166.82s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.003238550154492259, 'learning_rate': 1e-05, 'num_tokens': 639182076.0, 'completions/mean_length': 7312.828125, 'completions/min_length': 1419.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7020.20947265625, 'completions/min_terminated_length': 1419.0, 'completions/max_terminated_length': 15210.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3505876660346985, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020643584430217743, 'sampling/sampling_logp_difference/max': 1.3967032432556152, 'sampling/importance_sampling_ratio/min': 0.24741129577159882, 'sampling/importance_sampling_ratio/mean': 0.9999760985374451, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.928118409632589e-05, 'epoch': 0.66}
+
+ 70%|██████▉   | 712/1024 [31:57:04<14:27:26, 166.82s/it][AINFO 12-02 03:28:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:28:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:28:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:28:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|██████▉   | 713/1024 [32:00:08<14:50:53, 171.88s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0035087091382592916, 'learning_rate': 1e-05, 'num_tokens': 640197939.0, 'completions/mean_length': 7793.3671875, 'completions/min_length': 1473.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7444.154296875, 'completions/min_terminated_length': 1473.0, 'completions/max_terminated_length': 16214.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01964278146624565, 'sampling/sampling_logp_difference/max': 2.365344285964966, 'sampling/importance_sampling_ratio/min': 0.09391696006059647, 'sampling/importance_sampling_ratio/mean': 1.0000097751617432, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3987408389511984e-05, 'epoch': 0.66}
+
+ 70%|██████▉   | 713/1024 [32:00:08<14:50:53, 171.88s/it][AINFO 12-02 03:31:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:31:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:31:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:31:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|██████▉   | 714/1024 [32:02:43<14:23:03, 167.04s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020325910300016403, 'learning_rate': 1e-05, 'num_tokens': 641114516.0, 'completions/mean_length': 7015.6328125, 'completions/min_length': 893.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6713.42724609375, 'completions/min_terminated_length': 893.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3624350428581238, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018963608890771866, 'sampling/sampling_logp_difference/max': 4.708624839782715, 'sampling/importance_sampling_ratio/min': 0.00901716947555542, 'sampling/importance_sampling_ratio/mean': 1.0000211000442505, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.82506984048814e-05, 'epoch': 0.66}
+
+ 70%|██████▉   | 714/1024 [32:02:43<14:23:03, 167.04s/it][AINFO 12-02 03:34:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:34:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:34:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:34:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|██████▉   | 715/1024 [32:06:03<15:10:11, 176.74s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001088503748178482, 'learning_rate': 1e-05, 'num_tokens': 642304197.0, 'completions/mean_length': 9155.2578125, 'completions/min_length': 1227.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 8542.65234375, 'completions/min_terminated_length': 1227.0, 'completions/max_terminated_length': 15972.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.29432153701782227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019820773974061012, 'sampling/sampling_logp_difference/max': 2.659818172454834, 'sampling/importance_sampling_ratio/min': 0.06996094435453415, 'sampling/importance_sampling_ratio/mean': 0.9999903440475464, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.251352599269012e-05, 'epoch': 0.66}
+
+ 70%|██████▉   | 715/1024 [32:06:03<15:10:11, 176.74s/it][AINFO 12-02 03:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:37:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:37:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|██████▉   | 716/1024 [32:08:59<15:07:06, 176.71s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002052560681477189, 'learning_rate': 1e-05, 'num_tokens': 643376142.0, 'completions/mean_length': 8232.4453125, 'completions/min_length': 855.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7615.94140625, 'completions/min_terminated_length': 855.0, 'completions/max_terminated_length': 15992.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2790641486644745, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020947234705090523, 'sampling/sampling_logp_difference/max': 6.214273452758789, 'sampling/importance_sampling_ratio/min': 0.0020006694830954075, 'sampling/importance_sampling_ratio/mean': 1.0000183582305908, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.702325142920017e-05, 'epoch': 0.66}
+
+ 70%|██████▉   | 716/1024 [32:08:59<15:07:06, 176.71s/it][AINFO 12-02 03:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:40:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|███████   | 717/1024 [32:11:48<14:52:14, 174.38s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019588759168982506, 'learning_rate': 1e-05, 'num_tokens': 644173578.0, 'completions/mean_length': 6067.53125, 'completions/min_length': 903.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5648.16259765625, 'completions/min_terminated_length': 903.0, 'completions/max_terminated_length': 15794.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.22803518176078796, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01862824335694313, 'sampling/sampling_logp_difference/max': 1.9100069999694824, 'sampling/importance_sampling_ratio/min': 0.14807935059070587, 'sampling/importance_sampling_ratio/mean': 0.9999874830245972, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7193092566667474e-05, 'epoch': 0.66}
+
+ 70%|███████   | 717/1024 [32:11:48<14:52:14, 174.38s/it][AINFO 12-02 03:43:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:43:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|███████   | 718/1024 [32:14:51<15:02:03, 176.87s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020787471439689398, 'learning_rate': 1e-05, 'num_tokens': 645226383.0, 'completions/mean_length': 8060.7265625, 'completions/min_length': 1639.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7928.611328125, 'completions/min_terminated_length': 1639.0, 'completions/max_terminated_length': 15771.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.23751862347126007, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020670078694820404, 'sampling/sampling_logp_difference/max': 3.9198334217071533, 'sampling/importance_sampling_ratio/min': 0.019844399765133858, 'sampling/importance_sampling_ratio/mean': 0.9999464750289917, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.875388506457966e-05, 'epoch': 0.66}
+
+ 70%|███████   | 718/1024 [32:14:51<15:02:03, 176.87s/it][AINFO 12-02 03:46:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:46:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:46:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:46:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 70%|███████   | 719/1024 [32:17:29<14:30:33, 171.26s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0025326372124254704, 'learning_rate': 1e-05, 'num_tokens': 646168957.0, 'completions/mean_length': 7212.484375, 'completions/min_length': 1313.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7066.9052734375, 'completions/min_terminated_length': 1313.0, 'completions/max_terminated_length': 15838.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.28011515736579895, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020189261063933372, 'sampling/sampling_logp_difference/max': 2.186849594116211, 'sampling/importance_sampling_ratio/min': 0.11226989328861237, 'sampling/importance_sampling_ratio/mean': 0.999981164932251, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.065027110300434e-05, 'epoch': 0.66}
+
+ 70%|███████   | 719/1024 [32:17:29<14:30:33, 171.26s/it][AINFO 12-02 03:48:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:48:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:48:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:48:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 70%|███████   | 720/1024 [32:20:17<14:22:48, 170.29s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002328587230294943, 'learning_rate': 1e-05, 'num_tokens': 647102202.0, 'completions/mean_length': 7161.3515625, 'completions/min_length': 1106.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6786.44677734375, 'completions/min_terminated_length': 1106.0, 'completions/max_terminated_length': 15887.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2012200653553009, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02022959291934967, 'sampling/sampling_logp_difference/max': 4.6981587409973145, 'sampling/importance_sampling_ratio/min': 0.0091120395809412, 'sampling/importance_sampling_ratio/mean': 1.0000085830688477, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.2057510313497914e-05, 'epoch': 0.66}
+
+ 70%|███████   | 720/1024 [32:20:17<14:22:48, 170.29s/it][AINFO 12-02 03:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:51:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 70%|███████   | 721/1024 [32:23:19<14:37:48, 173.82s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0010886890813708305, 'learning_rate': 1e-05, 'num_tokens': 648169241.0, 'completions/mean_length': 8178.1171875, 'completions/min_length': 1599.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7406.62451171875, 'completions/min_terminated_length': 1599.0, 'completions/max_terminated_length': 16375.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.12255740165710449, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.019835717976093292, 'sampling/sampling_logp_difference/max': 4.098509788513184, 'sampling/importance_sampling_ratio/min': 0.016597390174865723, 'sampling/importance_sampling_ratio/mean': 0.9999650120735168, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.269015476485947e-05, 'epoch': 0.66}
+
+ 70%|███████   | 721/1024 [32:23:19<14:37:48, 173.82s/it][AINFO 12-02 03:54:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:54:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:54:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:54:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 71%|███████   | 722/1024 [32:26:20<14:45:38, 175.95s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0024462107103317976, 'learning_rate': 1e-05, 'num_tokens': 649231573.0, 'completions/mean_length': 8138.34375, 'completions/min_length': 661.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7940.4482421875, 'completions/min_terminated_length': 661.0, 'completions/max_terminated_length': 16140.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3816363215446472, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021149171516299248, 'sampling/sampling_logp_difference/max': 2.942068576812744, 'sampling/importance_sampling_ratio/min': 0.05275648459792137, 'sampling/importance_sampling_ratio/mean': 1.0000066757202148, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.443379493248358e-05, 'epoch': 0.66}
+
+ 71%|███████   | 722/1024 [32:26:20<14:45:38, 175.95s/it][AINFO 12-02 03:57:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:57:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:57:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 03:57:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+[OpenTinker] 2025-12-02 03:59:44,259 - math_verify.grader - WARNING - Timeout during comparison
+
+ 71%|███████   | 723/1024 [32:29:34<15:09:47, 181.35s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018793430645018816, 'learning_rate': 1e-05, 'num_tokens': 650372981.0, 'completions/mean_length': 8725.5, 'completions/min_length': 1062.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7859.75634765625, 'completions/min_terminated_length': 1062.0, 'completions/max_terminated_length': 15133.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.022644322365522385, 'sampling/sampling_logp_difference/max': 2.9773528575897217, 'sampling/importance_sampling_ratio/min': 0.05092746764421463, 'sampling/importance_sampling_ratio/mean': 1.000082015991211, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.4028398797781847e-05, 'epoch': 0.67}
+
+ 71%|███████   | 723/1024 [32:29:34<15:09:47, 181.35s/it][AINFO 12-02 04:00:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:00:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:00:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:00:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 71%|███████   | 724/1024 [32:32:00<14:13:21, 170.67s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002970259403809905, 'learning_rate': 1e-05, 'num_tokens': 651193718.0, 'completions/mean_length': 6267.1953125, 'completions/min_length': 995.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6187.53564453125, 'completions/min_terminated_length': 995.0, 'completions/max_terminated_length': 14861.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.32825323939323425, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021695896983146667, 'sampling/sampling_logp_difference/max': 1.6695520877838135, 'sampling/importance_sampling_ratio/min': 0.18833139538764954, 'sampling/importance_sampling_ratio/mean': 0.9999880790710449, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.14388177634828e-05, 'epoch': 0.67}
+
+ 71%|███████   | 724/1024 [32:32:00<14:13:21, 170.67s/it][AINFO 12-02 04:03:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:03:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:03:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:03:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 71%|███████   | 725/1024 [32:34:51<14:10:38, 170.70s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019801442977041006, 'learning_rate': 1e-05, 'num_tokens': 652285208.0, 'completions/mean_length': 8370.015625, 'completions/min_length': 1136.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7464.08642578125, 'completions/min_terminated_length': 1136.0, 'completions/max_terminated_length': 16307.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29432153701782227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019409900531172752, 'sampling/sampling_logp_difference/max': 2.075321674346924, 'sampling/importance_sampling_ratio/min': 0.12551604211330414, 'sampling/importance_sampling_ratio/mean': 1.0000568628311157, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.658897063971381e-05, 'epoch': 0.67}
+
+ 71%|███████   | 725/1024 [32:34:51<14:10:38, 170.70s/it][AINFO 12-02 04:06:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:06:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 71%|███████   | 726/1024 [32:38:26<15:14:47, 184.19s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021733117755502462, 'learning_rate': 1e-05, 'num_tokens': 653332440.0, 'completions/mean_length': 8035.8125, 'completions/min_length': 871.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7250.9404296875, 'completions/min_terminated_length': 871.0, 'completions/max_terminated_length': 16334.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2109457403421402, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02135477401316166, 'sampling/sampling_logp_difference/max': 7.091533184051514, 'sampling/importance_sampling_ratio/min': 0.000832120596896857, 'sampling/importance_sampling_ratio/mean': 1.0000715255737305, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.2891542079814826e-05, 'epoch': 0.67}
+
+ 71%|███████   | 726/1024 [32:38:26<15:14:47, 184.19s/it][AINFO 12-02 04:09:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:09:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:09:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:09:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 71%|███████   | 727/1024 [32:41:51<15:41:52, 190.28s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001998819410800934, 'learning_rate': 1e-05, 'num_tokens': 654460539.0, 'completions/mean_length': 8667.5859375, 'completions/min_length': 870.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 8013.65234375, 'completions/min_terminated_length': 870.0, 'completions/max_terminated_length': 16308.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.172288179397583, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01987341418862343, 'sampling/sampling_logp_difference/max': 4.450218200683594, 'sampling/importance_sampling_ratio/min': 0.011676019057631493, 'sampling/importance_sampling_ratio/mean': 1.0000563859939575, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.537924467560515e-05, 'epoch': 0.67}
+
+ 71%|███████   | 727/1024 [32:41:51<15:41:52, 190.28s/it][AINFO 12-02 04:13:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:13:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:13:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:13:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 71%|███████   | 728/1024 [32:44:41<15:09:38, 184.39s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019672433845698833, 'learning_rate': 1e-05, 'num_tokens': 655471331.0, 'completions/mean_length': 7743.6875, 'completions/min_length': 1302.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7318.75390625, 'completions/min_terminated_length': 1302.0, 'completions/max_terminated_length': 16138.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.31694266200065613, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019686676561832428, 'sampling/sampling_logp_difference/max': 6.355484485626221, 'sampling/importance_sampling_ratio/min': 0.00173719332087785, 'sampling/importance_sampling_ratio/mean': 0.9999656081199646, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.907948388994555e-05, 'epoch': 0.67}
+
+ 71%|███████   | 728/1024 [32:44:41<15:09:38, 184.39s/it][AINFO 12-02 04:15:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:15:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:15:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:15:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 71%|███████   | 729/1024 [32:47:16<14:22:44, 175.47s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0007596180657856166, 'learning_rate': 1e-05, 'num_tokens': 656230273.0, 'completions/mean_length': 5776.984375, 'completions/min_length': 794.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5608.619140625, 'completions/min_terminated_length': 794.0, 'completions/max_terminated_length': 14700.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.12073516845703125, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.01932205818593502, 'sampling/sampling_logp_difference/max': 1.558834195137024, 'sampling/importance_sampling_ratio/min': 0.2103811800479889, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.288574089310714e-05, 'epoch': 0.67}
+
+ 71%|███████   | 729/1024 [32:47:16<14:22:44, 175.47s/it][AINFO 12-02 04:18:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:18:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:18:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:18:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 71%|███████▏  | 730/1024 [32:50:23<14:36:49, 178.94s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001592203276231885, 'learning_rate': 1e-05, 'num_tokens': 657381111.0, 'completions/mean_length': 8822.234375, 'completions/min_length': 1749.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 7893.5966796875, 'completions/min_terminated_length': 1749.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3016754686832428, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019806239753961563, 'sampling/sampling_logp_difference/max': 2.472311019897461, 'sampling/importance_sampling_ratio/min': 0.14609937369823456, 'sampling/importance_sampling_ratio/mean': 0.9999668598175049, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.940237067785347e-05, 'epoch': 0.67}
+
+ 71%|███████▏  | 730/1024 [32:50:23<14:36:49, 178.94s/it][AINFO 12-02 04:21:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:21:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:21:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:21:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 71%|███████▏  | 731/1024 [32:52:58<13:58:42, 171.75s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001881606294773519, 'learning_rate': 1e-05, 'num_tokens': 658182984.0, 'completions/mean_length': 6123.3828125, 'completions/min_length': 1140.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6042.59033203125, 'completions/min_terminated_length': 1140.0, 'completions/max_terminated_length': 16025.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.24777325987815857, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017632314935326576, 'sampling/sampling_logp_difference/max': 1.6216411590576172, 'sampling/importance_sampling_ratio/min': 0.197574183344841, 'sampling/importance_sampling_ratio/mean': 0.9999692440032959, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.147854576785903e-05, 'epoch': 0.67}
+
+ 71%|███████▏  | 731/1024 [32:52:58<13:58:42, 171.75s/it][AINFO 12-02 04:24:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:24:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:24:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:24:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 71%|███████▏  | 732/1024 [32:56:23<14:43:51, 181.62s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001493369578383863, 'learning_rate': 1e-05, 'num_tokens': 659247285.0, 'completions/mean_length': 8156.1015625, 'completions/min_length': 1857.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7533.82373046875, 'completions/min_terminated_length': 1857.0, 'completions/max_terminated_length': 15634.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2806568741798401, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020409945398569107, 'sampling/sampling_logp_difference/max': 2.4542205333709717, 'sampling/importance_sampling_ratio/min': 0.08593014627695084, 'sampling/importance_sampling_ratio/mean': 0.9999954700469971, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.451694460134604e-05, 'epoch': 0.67}
+
+ 71%|███████▏  | 732/1024 [32:56:23<14:43:51, 181.62s/it][AINFO 12-02 04:27:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:27:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:27:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:27:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 72%|███████▏  | 733/1024 [32:59:16<14:28:29, 179.07s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001374746672809124, 'learning_rate': 1e-05, 'num_tokens': 660165967.0, 'completions/mean_length': 7034.703125, 'completions/min_length': 842.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6574.9013671875, 'completions/min_terminated_length': 842.0, 'completions/max_terminated_length': 15953.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.23410367965698242, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020049314945936203, 'sampling/sampling_logp_difference/max': 5.874997615814209, 'sampling/importance_sampling_ratio/min': 0.00280880113132298, 'sampling/importance_sampling_ratio/mean': 1.000014305114746, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.855342410792218e-05, 'epoch': 0.67}
+
+ 72%|███████▏  | 733/1024 [32:59:16<14:28:29, 179.07s/it][AINFO 12-02 04:30:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:30:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 72%|███████▏  | 734/1024 [33:02:12<14:21:17, 178.20s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001424529473297298, 'learning_rate': 1e-05, 'num_tokens': 661155758.0, 'completions/mean_length': 7579.8671875, 'completions/min_length': 2223.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6992.92529296875, 'completions/min_terminated_length': 2223.0, 'completions/max_terminated_length': 16056.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.20964756608009338, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019834402948617935, 'sampling/sampling_logp_difference/max': 2.434292793273926, 'sampling/importance_sampling_ratio/min': 0.08765972405672073, 'sampling/importance_sampling_ratio/mean': 1.0000044107437134, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.2843028722927556e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 734/1024 [33:02:12<14:21:17, 178.20s/it][AINFO 12-02 04:33:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:33:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:33:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:33:29 [block_pool.py:292] Successfully reset prefix cache
+
+ 72%|███████▏  | 735/1024 [33:05:04<14:09:49, 176.43s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002382380422204733, 'learning_rate': 1e-05, 'num_tokens': 662125314.0, 'completions/mean_length': 7427.59375, 'completions/min_length': 1584.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7138.67724609375, 'completions/min_terminated_length': 1584.0, 'completions/max_terminated_length': 15349.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.18649454414844513, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020063366740942, 'sampling/sampling_logp_difference/max': 11.82745361328125, 'sampling/importance_sampling_ratio/min': 7.3013329711102415e-06, 'sampling/importance_sampling_ratio/mean': 1.0000174045562744, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.649659743132361e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 735/1024 [33:05:04<14:09:49, 176.43s/it][AINFO 12-02 04:36:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:36:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:36:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:36:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 72%|███████▏  | 736/1024 [33:07:43<13:41:49, 171.21s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001806004554964602, 'learning_rate': 1e-05, 'num_tokens': 663055015.0, 'completions/mean_length': 7115.4765625, 'completions/min_length': 952.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7042.49609375, 'completions/min_terminated_length': 952.0, 'completions/max_terminated_length': 14990.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.19833698868751526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02213163673877716, 'sampling/sampling_logp_difference/max': 2.2050986289978027, 'sampling/importance_sampling_ratio/min': 0.11023964732885361, 'sampling/importance_sampling_ratio/mean': 1.0000360012054443, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.974680194256507e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 736/1024 [33:07:43<13:41:49, 171.21s/it][AINFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:39:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 72%|███████▏  | 737/1024 [33:10:35<13:39:18, 171.28s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012251195730641484, 'learning_rate': 1e-05, 'num_tokens': 664078952.0, 'completions/mean_length': 7833.3828125, 'completions/min_length': 1342.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7485.79638671875, 'completions/min_terminated_length': 1342.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.29484066367149353, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018990887328982353, 'sampling/sampling_logp_difference/max': 4.7128520011901855, 'sampling/importance_sampling_ratio/min': 0.008979132398962975, 'sampling/importance_sampling_ratio/mean': 1.000061273574829, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.187703231968044e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 737/1024 [33:10:35<13:39:18, 171.28s/it][AINFO 12-02 04:41:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:41:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 72%|███████▏  | 738/1024 [33:13:18<13:25:08, 168.91s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012704364489763975, 'learning_rate': 1e-05, 'num_tokens': 664976672.0, 'completions/mean_length': 6855.875, 'completions/min_length': 1069.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6548.51611328125, 'completions/min_terminated_length': 1069.0, 'completions/max_terminated_length': 14956.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019447041675448418, 'sampling/sampling_logp_difference/max': 4.590576648712158, 'sampling/importance_sampling_ratio/min': 0.010147005319595337, 'sampling/importance_sampling_ratio/mean': 1.0000051259994507, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5142412855293514e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 738/1024 [33:13:18<13:25:08, 168.91s/it][AINFO 12-02 04:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:44:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:44:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 72%|███████▏  | 739/1024 [33:16:13<13:31:13, 170.78s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0011779989581555128, 'learning_rate': 1e-05, 'num_tokens': 665920949.0, 'completions/mean_length': 7214.0390625, 'completions/min_length': 926.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6993.96044921875, 'completions/min_terminated_length': 926.0, 'completions/max_terminated_length': 16074.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.16675648093223572, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019848698750138283, 'sampling/sampling_logp_difference/max': 2.698779582977295, 'sampling/importance_sampling_ratio/min': 0.06728758662939072, 'sampling/importance_sampling_ratio/mean': 1.00002121925354, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.184285830888257e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 739/1024 [33:16:13<13:31:13, 170.78s/it][AINFO 12-02 04:47:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:47:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:47:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:47:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 72%|███████▏  | 740/1024 [33:19:06<13:30:59, 171.34s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.00195691897533834, 'learning_rate': 1e-05, 'num_tokens': 666904129.0, 'completions/mean_length': 7544.59375, 'completions/min_length': 963.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6713.53857421875, 'completions/min_terminated_length': 963.0, 'completions/max_terminated_length': 16166.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2856566905975342, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019027139991521835, 'sampling/sampling_logp_difference/max': 2.0247929096221924, 'sampling/importance_sampling_ratio/min': 0.13202117383480072, 'sampling/importance_sampling_ratio/mean': 1.000046968460083, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.690341052082658e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 740/1024 [33:19:06<13:30:59, 171.34s/it][AINFO 12-02 04:50:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:50:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:50:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:50:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 72%|███████▏  | 741/1024 [33:21:46<13:12:48, 168.09s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002351365750655532, 'learning_rate': 1e-05, 'num_tokens': 667861474.0, 'completions/mean_length': 7264.7578125, 'completions/min_length': 809.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7124.6513671875, 'completions/min_terminated_length': 809.0, 'completions/max_terminated_length': 15870.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.30275392532348633, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02189580723643303, 'sampling/sampling_logp_difference/max': 2.582282066345215, 'sampling/importance_sampling_ratio/min': 0.07560127973556519, 'sampling/importance_sampling_ratio/mean': 1.0000860691070557, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.869810729564051e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 741/1024 [33:21:47<13:12:48, 168.09s/it][AINFO 12-02 04:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:53:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 72%|███████▏  | 742/1024 [33:24:54<13:37:50, 174.01s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0006935278652235866, 'learning_rate': 1e-05, 'num_tokens': 668993891.0, 'completions/mean_length': 8701.7578125, 'completions/min_length': 1379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 8257.330078125, 'completions/min_terminated_length': 1379.0, 'completions/max_terminated_length': 16213.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.0946863517165184, 'frac_reward_zero_std': 0.8125, 'sampling/sampling_logp_difference/mean': 0.019281234592199326, 'sampling/sampling_logp_difference/max': 2.5525050163269043, 'sampling/importance_sampling_ratio/min': 0.07788631319999695, 'sampling/importance_sampling_ratio/mean': 1.0000407695770264, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.0904622886064317e-05, 'epoch': 0.68}
+
+ 72%|███████▏  | 742/1024 [33:24:54<13:37:50, 174.01s/it][AINFO 12-02 04:56:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:56:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:56:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:56:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 73%|███████▎  | 743/1024 [33:27:32<13:12:33, 169.23s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002603888511657715, 'learning_rate': 1e-05, 'num_tokens': 669919989.0, 'completions/mean_length': 7092.328125, 'completions/min_length': 1287.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6714.61767578125, 'completions/min_terminated_length': 1287.0, 'completions/max_terminated_length': 15474.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.39424505829811096, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01848428137600422, 'sampling/sampling_logp_difference/max': 2.2494406700134277, 'sampling/importance_sampling_ratio/min': 0.10545819252729416, 'sampling/importance_sampling_ratio/mean': 0.9999926686286926, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.54721202156361e-05, 'epoch': 0.68}
+
+ 73%|███████▎  | 743/1024 [33:27:32<13:12:33, 169.23s/it][AINFO 12-02 04:58:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 04:58:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 73%|███████▎  | 744/1024 [33:30:31<13:22:50, 172.04s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0019989742431789637, 'learning_rate': 1e-05, 'num_tokens': 670847326.0, 'completions/mean_length': 7100.9453125, 'completions/min_length': 610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6723.5849609375, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 15511.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2120065987110138, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020306920632719994, 'sampling/sampling_logp_difference/max': 2.4030051231384277, 'sampling/importance_sampling_ratio/min': 0.09044574201107025, 'sampling/importance_sampling_ratio/mean': 1.0000207424163818, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.979742871251801e-05, 'epoch': 0.68}
+
+ 73%|███████▎  | 744/1024 [33:30:31<13:22:50, 172.04s/it][AINFO 12-02 05:01:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:01:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 73%|███████▎  | 745/1024 [33:33:51<13:58:42, 180.37s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015919266734272242, 'learning_rate': 1e-05, 'num_tokens': 671830200.0, 'completions/mean_length': 7532.578125, 'completions/min_length': 1138.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7172.76416015625, 'completions/min_terminated_length': 1138.0, 'completions/max_terminated_length': 16380.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.19332443177700043, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020771153271198273, 'sampling/sampling_logp_difference/max': 2.4045066833496094, 'sampling/importance_sampling_ratio/min': 0.09031002968549728, 'sampling/importance_sampling_ratio/mean': 1.0000309944152832, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.122653874423122e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 745/1024 [33:33:51<13:58:42, 180.37s/it][AINFO 12-02 05:05:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:05:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:05:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:05:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 73%|███████▎  | 746/1024 [33:36:56<14:02:52, 181.92s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0008248513913713396, 'learning_rate': 1e-05, 'num_tokens': 672836626.0, 'completions/mean_length': 7714.203125, 'completions/min_length': 750.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7212.64453125, 'completions/min_terminated_length': 750.0, 'completions/max_terminated_length': 15466.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.23410366475582123, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020917613059282303, 'sampling/sampling_logp_difference/max': 3.2305679321289062, 'sampling/importance_sampling_ratio/min': 0.0395350381731987, 'sampling/importance_sampling_ratio/mean': 1.0000051259994507, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.1657465292009874e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 746/1024 [33:36:56<14:02:52, 181.92s/it][AINFO 12-02 05:08:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:08:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:08:13 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:08:13 [block_pool.py:292] Successfully reset prefix cache
+
+ 73%|███████▎  | 747/1024 [33:40:13<14:19:54, 186.26s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001078522764146328, 'learning_rate': 1e-05, 'num_tokens': 673922380.0, 'completions/mean_length': 8338.203125, 'completions/min_length': 1300.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7801.81689453125, 'completions/min_terminated_length': 1300.0, 'completions/max_terminated_length': 16201.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.15991678833961487, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020228199660778046, 'sampling/sampling_logp_difference/max': 2.5985052585601807, 'sampling/importance_sampling_ratio/min': 0.07438468188047409, 'sampling/importance_sampling_ratio/mean': 1.000024437904358, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.080667465837905e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 747/1024 [33:40:13<14:19:54, 186.26s/it][AINFO 12-02 05:11:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:11:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:11:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:11:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 73%|███████▎  | 748/1024 [33:43:06<13:58:44, 182.34s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001997053623199463, 'learning_rate': 1e-05, 'num_tokens': 674923280.0, 'completions/mean_length': 7667.46875, 'completions/min_length': 1215.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7458.2724609375, 'completions/min_terminated_length': 1215.0, 'completions/max_terminated_length': 15290.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.1804162561893463, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.022770514711737633, 'sampling/sampling_logp_difference/max': 4.123444080352783, 'sampling/importance_sampling_ratio/min': 0.016188664361834526, 'sampling/importance_sampling_ratio/mean': 1.0000039339065552, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.5738096951499756e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 748/1024 [33:43:06<13:58:44, 182.34s/it][AINFO 12-02 05:14:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:14:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:14:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:14:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 73%|███████▎  | 749/1024 [33:46:10<13:58:14, 182.89s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0018654224695637822, 'learning_rate': 1e-05, 'num_tokens': 675990062.0, 'completions/mean_length': 8192.421875, 'completions/min_length': 1424.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7928.17724609375, 'completions/min_terminated_length': 1424.0, 'completions/max_terminated_length': 15790.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021591084077954292, 'sampling/sampling_logp_difference/max': 2.933685064315796, 'sampling/importance_sampling_ratio/min': 0.053200628608465195, 'sampling/importance_sampling_ratio/mean': 0.9999675750732422, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3797306830128946e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 749/1024 [33:46:10<13:58:14, 182.89s/it][AINFO 12-02 05:17:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:17:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 73%|███████▎  | 750/1024 [33:49:07<13:46:37, 181.01s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015752243343740702, 'learning_rate': 1e-05, 'num_tokens': 677024771.0, 'completions/mean_length': 7940.1640625, 'completions/min_length': 1478.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7524.89306640625, 'completions/min_terminated_length': 1478.0, 'completions/max_terminated_length': 15977.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01934216357767582, 'sampling/sampling_logp_difference/max': 7.749996185302734, 'sampling/importance_sampling_ratio/min': 0.0004307441704440862, 'sampling/importance_sampling_ratio/mean': 0.9999838471412659, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.693537630373612e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 750/1024 [33:49:07<13:46:37, 181.01s/it][AINFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache
+
+ 73%|███████▎  | 751/1024 [33:52:02<13:36:11, 179.38s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0009091979009099305, 'learning_rate': 1e-05, 'num_tokens': 678088139.0, 'completions/mean_length': 8160.125, 'completions/min_length': 1889.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7962.75244140625, 'completions/min_terminated_length': 1889.0, 'completions/max_terminated_length': 15841.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019523944705724716, 'sampling/sampling_logp_difference/max': 2.2135276794433594, 'sampling/importance_sampling_ratio/min': 0.10931434482336044, 'sampling/importance_sampling_ratio/mean': 1.0000240802764893, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1429076190979686e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 751/1024 [33:52:02<13:36:11, 179.38s/it][AINFO 12-02 05:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:23:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:23:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 73%|███████▎  | 752/1024 [33:55:02<13:33:47, 179.51s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0022500595077872276, 'learning_rate': 1e-05, 'num_tokens': 679203068.0, 'completions/mean_length': 8550.5703125, 'completions/min_length': 934.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8426.23046875, 'completions/min_terminated_length': 934.0, 'completions/max_terminated_length': 16213.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02082168310880661, 'sampling/sampling_logp_difference/max': 2.608529567718506, 'sampling/importance_sampling_ratio/min': 0.09096775203943253, 'sampling/importance_sampling_ratio/mean': 0.9999498128890991, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2988857987656957e-05, 'epoch': 0.69}
+
+ 73%|███████▎  | 752/1024 [33:55:02<13:33:47, 179.51s/it][AINFO 12-02 05:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:26:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:26:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 74%|███████▎  | 753/1024 [33:57:59<13:26:46, 178.62s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002147505059838295, 'learning_rate': 1e-05, 'num_tokens': 680292173.0, 'completions/mean_length': 8312.8828125, 'completions/min_length': 1177.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8052.52392578125, 'completions/min_terminated_length': 1177.0, 'completions/max_terminated_length': 16359.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.345874547958374, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020419074222445488, 'sampling/sampling_logp_difference/max': 12.062121391296387, 'sampling/importance_sampling_ratio/min': 5.774139026470948e-06, 'sampling/importance_sampling_ratio/mean': 1.0000628232955933, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.488336111942772e-05, 'epoch': 0.69}
+
+ 74%|███████▎  | 753/1024 [33:57:59<13:26:46, 178.62s/it][AINFO 12-02 05:29:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:29:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:29:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:29:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 74%|███████▎  | 754/1024 [34:00:50<13:13:31, 176.34s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016695107333362103, 'learning_rate': 1e-05, 'num_tokens': 681352884.0, 'completions/mean_length': 8111.6796875, 'completions/min_length': 1759.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7844.83056640625, 'completions/min_terminated_length': 1759.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.23987272381782532, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021483460441231728, 'sampling/sampling_logp_difference/max': 1.5743496417999268, 'sampling/importance_sampling_ratio/min': 0.20714221894741058, 'sampling/importance_sampling_ratio/mean': 1.0000377893447876, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.749491859001864e-05, 'epoch': 0.69}
+
+ 74%|███████▎  | 754/1024 [34:00:50<13:13:31, 176.34s/it][AINFO 12-02 05:32:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:32:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:32:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:32:06 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 74%|███████▎  | 755/1024 [34:03:50<13:16:21, 177.63s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012720171362161636, 'learning_rate': 1e-05, 'num_tokens': 682500327.0, 'completions/mean_length': 8822.7109375, 'completions/min_length': 1691.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8318.625, 'completions/min_terminated_length': 1691.0, 'completions/max_terminated_length': 16138.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.28353503346443176, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021024532616138458, 'sampling/sampling_logp_difference/max': 2.57590389251709, 'sampling/importance_sampling_ratio/min': 0.07608501613140106, 'sampling/importance_sampling_ratio/mean': 0.9999703168869019, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.131184491347085e-05, 'epoch': 0.69}
+
+ 74%|███████▎  | 755/1024 [34:03:50<13:16:21, 177.63s/it][AINFO 12-02 05:35:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:35:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:35:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:35:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 74%|███████▍  | 756/1024 [34:06:15<12:29:25, 167.78s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002312050899490714, 'learning_rate': 1e-05, 'num_tokens': 683376241.0, 'completions/mean_length': 6670.953125, 'completions/min_length': 1272.0, 'completions/max_length': 15919.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6670.953125, 'completions/min_terminated_length': 1272.0, 'completions/max_terminated_length': 15919.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.21276308596134186, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01984693855047226, 'sampling/sampling_logp_difference/max': 1.6207530498504639, 'sampling/importance_sampling_ratio/min': 0.19774973392486572, 'sampling/importance_sampling_ratio/mean': 0.999997079372406, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.141915667332796e-05, 'epoch': 0.7}
+
+ 74%|███████▍  | 756/1024 [34:06:15<12:29:25, 167.78s/it][AINFO 12-02 05:37:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:37:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:37:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:37:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 74%|███████▍  | 757/1024 [34:08:59<12:21:27, 166.62s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014584095915779471, 'learning_rate': 1e-05, 'num_tokens': 684324141.0, 'completions/mean_length': 7275.21875, 'completions/min_length': 689.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7130.63525390625, 'completions/min_terminated_length': 689.0, 'completions/max_terminated_length': 16161.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02093195915222168, 'sampling/sampling_logp_difference/max': 2.4190683364868164, 'sampling/importance_sampling_ratio/min': 0.19059965014457703, 'sampling/importance_sampling_ratio/mean': 1.0000598430633545, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.6458126462312066e-05, 'epoch': 0.7}
+
+ 74%|███████▍  | 757/1024 [34:08:59<12:21:27, 166.62s/it][AINFO 12-02 05:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:40:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:40:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 74%|███████▍  | 758/1024 [34:11:53<12:27:53, 168.70s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0009590767440386117, 'learning_rate': 1e-05, 'num_tokens': 685268861.0, 'completions/mean_length': 7225.75, 'completions/min_length': 1489.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7080.38134765625, 'completions/min_terminated_length': 1489.0, 'completions/max_terminated_length': 15644.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.19226360321044922, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021962076425552368, 'sampling/sampling_logp_difference/max': 2.3160719871520996, 'sampling/importance_sampling_ratio/min': 0.09866036474704742, 'sampling/importance_sampling_ratio/mean': 1.0000410079956055, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.168988610013912e-05, 'epoch': 0.7}
+
+ 74%|███████▍  | 758/1024 [34:11:53<12:27:53, 168.70s/it][AINFO 12-02 05:43:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:43:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:43:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:43:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 74%|███████▍  | 759/1024 [34:14:37<12:19:01, 167.33s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020776265300810337, 'learning_rate': 1e-05, 'num_tokens': 686198811.0, 'completions/mean_length': 7109.546875, 'completions/min_length': 827.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6962.33349609375, 'completions/min_terminated_length': 827.0, 'completions/max_terminated_length': 16199.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019300073385238647, 'sampling/sampling_logp_difference/max': 7.034051418304443, 'sampling/importance_sampling_ratio/min': 0.0008813538588583469, 'sampling/importance_sampling_ratio/mean': 1.0000050067901611, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.737358281341585e-05, 'epoch': 0.7}
+
+ 74%|███████▍  | 759/1024 [34:14:37<12:19:01, 167.33s/it][AINFO 12-02 05:45:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:45:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:45:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:45:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 74%|███████▍  | 760/1024 [34:17:33<12:28:10, 170.04s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0023633502423763275, 'learning_rate': 1e-05, 'num_tokens': 687151142.0, 'completions/mean_length': 7281.2109375, 'completions/min_length': 931.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6833.5322265625, 'completions/min_terminated_length': 931.0, 'completions/max_terminated_length': 16252.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.23068872094154358, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021235153079032898, 'sampling/sampling_logp_difference/max': 2.2978432178497314, 'sampling/importance_sampling_ratio/min': 0.10047531127929688, 'sampling/importance_sampling_ratio/mean': 1.0001262426376343, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.181574636026198e-05, 'epoch': 0.7}
+
+ 74%|███████▍  | 760/1024 [34:17:33<12:28:10, 170.04s/it][AINFO 12-02 05:48:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:48:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:48:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:48:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 74%|███████▍  | 761/1024 [34:20:34<12:40:17, 173.45s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0011378803756088018, 'learning_rate': 1e-05, 'num_tokens': 688102642.0, 'completions/mean_length': 7297.15625, 'completions/min_length': 1298.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7079.072265625, 'completions/min_terminated_length': 1298.0, 'completions/max_terminated_length': 16057.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.15702588856220245, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.0210577305406332, 'sampling/sampling_logp_difference/max': 2.0020718574523926, 'sampling/importance_sampling_ratio/min': 0.13505518436431885, 'sampling/importance_sampling_ratio/mean': 1.000117540359497, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.071103040179878e-05, 'epoch': 0.7}
+
+ 74%|███████▍  | 761/1024 [34:20:34<12:40:17, 173.45s/it][AINFO 12-02 05:51:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:51:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:51:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:51:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 74%|███████▍  | 762/1024 [34:23:32<12:42:14, 174.56s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002737557515501976, 'learning_rate': 1e-05, 'num_tokens': 689122170.0, 'completions/mean_length': 7797.0, 'completions/min_length': 483.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7520.0, 'completions/min_terminated_length': 483.0, 'completions/max_terminated_length': 16266.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.24777324497699738, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02109168842434883, 'sampling/sampling_logp_difference/max': 2.619779586791992, 'sampling/importance_sampling_ratio/min': 0.07281891256570816, 'sampling/importance_sampling_ratio/mean': 1.0000017881393433, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.15831295363023e-05, 'epoch': 0.7}
+
+ 74%|███████▍  | 762/1024 [34:23:32<12:42:14, 174.56s/it][AINFO 12-02 05:54:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:54:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:54:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:54:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 75%|███████▍  | 763/1024 [34:26:19<12:30:09, 172.45s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002013716846704483, 'learning_rate': 1e-05, 'num_tokens': 690054321.0, 'completions/mean_length': 7132.3046875, 'completions/min_length': 1483.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7059.45654296875, 'completions/min_terminated_length': 1483.0, 'completions/max_terminated_length': 15524.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021316926926374435, 'sampling/sampling_logp_difference/max': 1.6584957838058472, 'sampling/importance_sampling_ratio/min': 0.19042521715164185, 'sampling/importance_sampling_ratio/mean': 1.0001296997070312, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6529790179429256e-05, 'epoch': 0.7}
+
+ 75%|███████▍  | 763/1024 [34:26:19<12:30:09, 172.45s/it][AINFO 12-02 05:57:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:57:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:57:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 05:57:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 75%|███████▍  | 764/1024 [34:29:17<12:34:30, 174.12s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017455084016546607, 'learning_rate': 1e-05, 'num_tokens': 691137869.0, 'completions/mean_length': 8325.21875, 'completions/min_length': 1595.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7642.27099609375, 'completions/min_terminated_length': 1595.0, 'completions/max_terminated_length': 15811.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.37822139263153076, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01899132877588272, 'sampling/sampling_logp_difference/max': 3.1942214965820312, 'sampling/importance_sampling_ratio/min': 0.0409984290599823, 'sampling/importance_sampling_ratio/mean': 0.9999898076057434, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.929428167903097e-05, 'epoch': 0.7}
+
+ 75%|███████▍  | 764/1024 [34:29:17<12:34:30, 174.12s/it][AINFO 12-02 06:00:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:00:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:00:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:00:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 75%|███████▍  | 765/1024 [34:32:02<12:19:31, 171.32s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002027466893196106, 'learning_rate': 1e-05, 'num_tokens': 692015717.0, 'completions/mean_length': 6619.8125, 'completions/min_length': 293.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6304.83837890625, 'completions/min_terminated_length': 293.0, 'completions/max_terminated_length': 15033.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.22225631773471832, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019295530393719673, 'sampling/sampling_logp_difference/max': 3.301266670227051, 'sampling/importance_sampling_ratio/min': 0.03683647885918617, 'sampling/importance_sampling_ratio/mean': 1.00004243850708, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.963070821555448e-05, 'epoch': 0.7}
+
+ 75%|███████▍  | 765/1024 [34:32:02<12:19:31, 171.32s/it][AINFO 12-02 06:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:03:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:03:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 75%|███████▍  | 766/1024 [34:34:43<12:03:25, 168.24s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021248471457511187, 'learning_rate': 1e-05, 'num_tokens': 692961551.0, 'completions/mean_length': 7245.078125, 'completions/min_length': 847.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6950.27392578125, 'completions/min_terminated_length': 847.0, 'completions/max_terminated_length': 16079.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019905809313058853, 'sampling/sampling_logp_difference/max': 5.018924236297607, 'sampling/importance_sampling_ratio/min': 0.006611635442823172, 'sampling/importance_sampling_ratio/mean': 0.9999723434448242, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.9428159880117164e-05, 'epoch': 0.7}
+
+ 75%|███████▍  | 766/1024 [34:34:43<12:03:25, 168.24s/it][AINFO 12-02 06:06:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:06:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:06:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:06:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 75%|███████▍  | 767/1024 [34:37:54<12:29:53, 175.07s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0016979157226160169, 'learning_rate': 1e-05, 'num_tokens': 694131370.0, 'completions/mean_length': 8980.8984375, 'completions/min_length': 948.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 8144.02587890625, 'completions/min_terminated_length': 948.0, 'completions/max_terminated_length': 16212.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.21648237109184265, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02024347335100174, 'sampling/sampling_logp_difference/max': 2.5362000465393066, 'sampling/importance_sampling_ratio/min': 0.07916665822267532, 'sampling/importance_sampling_ratio/mean': 0.9999918937683105, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.472207385537331e-05, 'epoch': 0.71}
+
+ 75%|███████▍  | 767/1024 [34:37:54<12:29:53, 175.07s/it][AINFO 12-02 06:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:09:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:09:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 75%|███████▌  | 768/1024 [34:40:31<12:03:34, 169.59s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001519337180070579, 'learning_rate': 1e-05, 'num_tokens': 695077531.0, 'completions/mean_length': 7210.0703125, 'completions/min_length': 1593.0, 'completions/max_length': 15685.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7210.0703125, 'completions/min_terminated_length': 1593.0, 'completions/max_terminated_length': 15685.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2756393849849701, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01942768692970276, 'sampling/sampling_logp_difference/max': 5.939518928527832, 'sampling/importance_sampling_ratio/min': 0.002633296186104417, 'sampling/importance_sampling_ratio/mean': 0.9999686479568481, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.0282319559992175e-05, 'epoch': 0.71}
+
+ 75%|███████▌  | 768/1024 [34:40:31<12:03:34, 169.59s/it][AINFO 12-02 06:11:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:11:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:11:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:11:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 75%|███████▌  | 769/1024 [34:43:59<12:49:48, 181.13s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0012529873056337237, 'learning_rate': 1e-05, 'num_tokens': 696373454.0, 'completions/mean_length': 9976.4609375, 'completions/min_length': 1269.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 9313.6123046875, 'completions/min_terminated_length': 1269.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019226733595132828, 'sampling/sampling_logp_difference/max': 5.624955177307129, 'sampling/importance_sampling_ratio/min': 0.003606724552810192, 'sampling/importance_sampling_ratio/mean': 1.0000343322753906, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.4677149364579236e-05, 'epoch': 0.71}
+
+ 75%|███████▌  | 769/1024 [34:43:59<12:49:48, 181.13s/it][AINFO 12-02 06:15:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:15:16 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 75%|███████▌  | 770/1024 [34:46:37<12:17:23, 174.19s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0020386227406561375, 'learning_rate': 1e-05, 'num_tokens': 697339056.0, 'completions/mean_length': 7374.515625, 'completions/min_length': 1233.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7231.50830078125, 'completions/min_terminated_length': 1233.0, 'completions/max_terminated_length': 16342.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.30744943022727966, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019161051139235497, 'sampling/sampling_logp_difference/max': 2.6830196380615234, 'sampling/importance_sampling_ratio/min': 0.06835643202066422, 'sampling/importance_sampling_ratio/mean': 0.9999449253082275, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.062856914468284e-05, 'epoch': 0.71}
+
+ 75%|███████▌  | 770/1024 [34:46:37<12:17:23, 174.19s/it][AINFO 12-02 06:17:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:17:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:17:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:17:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 75%|███████▌  | 771/1024 [34:49:31<12:14:50, 174.27s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001156375976279378, 'learning_rate': 1e-05, 'num_tokens': 698317684.0, 'completions/mean_length': 7492.09375, 'completions/min_length': 1019.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6977.685546875, 'completions/min_terminated_length': 1019.0, 'completions/max_terminated_length': 16205.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.21254336833953857, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019535014405846596, 'sampling/sampling_logp_difference/max': 3.4340500831604004, 'sampling/importance_sampling_ratio/min': 0.03225603699684143, 'sampling/importance_sampling_ratio/mean': 1.0000190734863281, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.597679668449928e-05, 'epoch': 0.71}
+
+ 75%|███████▌  | 771/1024 [34:49:31<12:14:50, 174.27s/it][AINFO 12-02 06:20:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:20:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:20:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:20:48 [block_pool.py:292] Successfully reset prefix cache
+
+ 75%|███████▌  | 772/1024 [34:52:28<12:14:55, 174.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020900629460811615, 'learning_rate': 1e-05, 'num_tokens': 699285648.0, 'completions/mean_length': 7367.15625, 'completions/min_length': 1526.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7224.0322265625, 'completions/min_terminated_length': 1526.0, 'completions/max_terminated_length': 16109.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2751026153564453, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022344589233398438, 'sampling/sampling_logp_difference/max': 2.3687334060668945, 'sampling/importance_sampling_ratio/min': 0.09359920024871826, 'sampling/importance_sampling_ratio/mean': 1.0000989437103271, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.313701406819746e-05, 'epoch': 0.71}
+
+ 75%|███████▌  | 772/1024 [34:52:28<12:14:55, 174.98s/it][AINFO 12-02 06:23:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:23:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:23:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:23:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 75%|███████▌  | 773/1024 [34:55:37<12:29:24, 179.14s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002543453825637698, 'learning_rate': 1e-05, 'num_tokens': 700455556.0, 'completions/mean_length': 8984.53125, 'completions/min_length': 1192.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8491.2333984375, 'completions/min_terminated_length': 1192.0, 'completions/max_terminated_length': 15946.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.33274173736572266, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02012353017926216, 'sampling/sampling_logp_difference/max': 3.525960922241211, 'sampling/importance_sampling_ratio/min': 0.02942351996898651, 'sampling/importance_sampling_ratio/mean': 1.0000981092453003, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.007537149343989e-05, 'epoch': 0.71}
+
+ 75%|███████▌  | 773/1024 [34:55:37<12:29:24, 179.14s/it][AINFO 12-02 06:26:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:26:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 76%|███████▌  | 774/1024 [34:58:30<12:19:23, 177.45s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0015105409547686577, 'learning_rate': 1e-05, 'num_tokens': 701499973.0, 'completions/mean_length': 7982.6328125, 'completions/min_length': 945.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7270.65234375, 'completions/min_terminated_length': 945.0, 'completions/max_terminated_length': 16362.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.1990984082221985, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02272830158472061, 'sampling/sampling_logp_difference/max': 5.716393947601318, 'sampling/importance_sampling_ratio/min': 0.003291559172794223, 'sampling/importance_sampling_ratio/mean': 1.0000369548797607, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.691575009019289e-05, 'epoch': 0.71}
+
+ 76%|███████▌  | 774/1024 [34:58:30<12:19:23, 177.45s/it][AINFO 12-02 06:29:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:29:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 76%|███████▌  | 775/1024 [35:01:09<11:53:15, 171.87s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0034643695689737797, 'learning_rate': 1e-05, 'num_tokens': 702344981.0, 'completions/mean_length': 6460.25, 'completions/min_length': 1046.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5972.1962890625, 'completions/min_terminated_length': 1046.0, 'completions/max_terminated_length': 15656.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2041109800338745, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.016874466091394424, 'sampling/sampling_logp_difference/max': 1.628553867340088, 'sampling/importance_sampling_ratio/min': 0.19621311128139496, 'sampling/importance_sampling_ratio/mean': 1.0000231266021729, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.22118523097015e-05, 'epoch': 0.71}
+
+ 76%|███████▌  | 775/1024 [35:01:09<11:53:15, 171.87s/it][AINFO 12-02 06:32:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:32:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:32:26 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:32:26 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 76%|███████▌  | 776/1024 [35:04:27<12:23:06, 179.78s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015479361172765493, 'learning_rate': 1e-05, 'num_tokens': 703581802.0, 'completions/mean_length': 9527.1640625, 'completions/min_length': 1179.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 9008.580078125, 'completions/min_terminated_length': 1179.0, 'completions/max_terminated_length': 16340.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3448137044906616, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018863730132579803, 'sampling/sampling_logp_difference/max': 9.598800659179688, 'sampling/importance_sampling_ratio/min': 6.781001866329461e-05, 'sampling/importance_sampling_ratio/mean': 1.0000441074371338, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.805525003983348e-05, 'epoch': 0.71}
+
+ 76%|███████▌  | 776/1024 [35:04:27<12:23:06, 179.78s/it][AINFO 12-02 06:35:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:35:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:35:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:35:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 76%|███████▌  | 777/1024 [35:07:25<12:17:38, 179.18s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019267711322754622, 'learning_rate': 1e-05, 'num_tokens': 704534307.0, 'completions/mean_length': 7302.0703125, 'completions/min_length': 1213.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6696.60888671875, 'completions/min_terminated_length': 1213.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.11913755536079407, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.019741591066122055, 'sampling/sampling_logp_difference/max': 1.9727998971939087, 'sampling/importance_sampling_ratio/min': 0.1390669345855713, 'sampling/importance_sampling_ratio/mean': 0.9999661445617676, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9503188045509887e-05, 'epoch': 0.71}
+
+ 76%|███████▌  | 777/1024 [35:07:25<12:17:38, 179.18s/it][AINFO 12-02 06:38:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:38:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:38:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:38:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 76%|███████▌  | 778/1024 [35:10:19<12:08:23, 177.66s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021035161335021257, 'learning_rate': 1e-05, 'num_tokens': 705595245.0, 'completions/mean_length': 8138.265625, 'completions/min_length': 1065.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7588.55029296875, 'completions/min_terminated_length': 1065.0, 'completions/max_terminated_length': 15796.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2227931022644043, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021144188940525055, 'sampling/sampling_logp_difference/max': 13.448639869689941, 'sampling/importance_sampling_ratio/min': 1.443211317564419e-06, 'sampling/importance_sampling_ratio/mean': 0.9999861121177673, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.467682163067366e-05, 'epoch': 0.72}
+
+ 76%|███████▌  | 778/1024 [35:10:19<12:08:23, 177.66s/it][AINFO 12-02 06:41:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:41:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:41:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:41:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 76%|███████▌  | 779/1024 [35:13:10<11:57:16, 175.66s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002917584730312228, 'learning_rate': 1e-05, 'num_tokens': 706493593.0, 'completions/mean_length': 6860.96875, 'completions/min_length': 1076.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6310.04931640625, 'completions/min_terminated_length': 1076.0, 'completions/max_terminated_length': 16225.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2396402657032013, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02070399932563305, 'sampling/sampling_logp_difference/max': 2.252303123474121, 'sampling/importance_sampling_ratio/min': 0.10515675693750381, 'sampling/importance_sampling_ratio/mean': 1.000055193901062, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.737031704531546e-05, 'epoch': 0.72}
+
+ 76%|███████▌  | 779/1024 [35:13:10<11:57:16, 175.66s/it][AINFO 12-02 06:44:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:44:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:44:27 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:44:27 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 76%|███████▌  | 780/1024 [35:16:17<12:07:51, 178.98s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0022225689608603716, 'learning_rate': 1e-05, 'num_tokens': 707671056.0, 'completions/mean_length': 9049.6796875, 'completions/min_length': 2216.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 8625.3798828125, 'completions/min_terminated_length': 2216.0, 'completions/max_terminated_length': 16337.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.326668381690979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01939445175230503, 'sampling/sampling_logp_difference/max': 2.4110021591186523, 'sampling/importance_sampling_ratio/min': 0.08972533792257309, 'sampling/importance_sampling_ratio/mean': 1.0000156164169312, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.136977737649431e-05, 'epoch': 0.72}
+
+ 76%|███████▌  | 780/1024 [35:16:17<12:07:51, 178.98s/it][AINFO 12-02 06:47:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:47:34 [block_pool.py:292] Successfully reset prefix cache
+
+ 76%|███████▋  | 781/1024 [35:19:01<11:46:30, 174.45s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0022880490869283676, 'learning_rate': 1e-05, 'num_tokens': 708547257.0, 'completions/mean_length': 6683.0703125, 'completions/min_length': 1217.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6450.24853515625, 'completions/min_terminated_length': 1217.0, 'completions/max_terminated_length': 16113.0, 'rewards/accuracy_reward/mean': 0.6640625, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.6640625, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019364919513463974, 'sampling/sampling_logp_difference/max': 4.845625877380371, 'sampling/importance_sampling_ratio/min': 0.00786269549280405, 'sampling/importance_sampling_ratio/mean': 0.9999892115592957, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.903869873691292e-05, 'epoch': 0.72}
+
+ 76%|███████▋  | 781/1024 [35:19:01<11:46:30, 174.45s/it][AINFO 12-02 06:50:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:50:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:50:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:50:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 76%|███████▋  | 782/1024 [35:21:45<11:30:59, 171.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007747720810584724, 'learning_rate': 1e-05, 'num_tokens': 709503329.0, 'completions/mean_length': 7316.125, 'completions/min_length': 919.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7244.724609375, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 16278.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2227931171655655, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020081840455532074, 'sampling/sampling_logp_difference/max': 2.6363892555236816, 'sampling/importance_sampling_ratio/min': 0.07161939889192581, 'sampling/importance_sampling_ratio/mean': 1.0000340938568115, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.91392852129502e-05, 'epoch': 0.72}
+
+ 76%|███████▋  | 782/1024 [35:21:45<11:30:59, 171.32s/it][AINFO 12-02 06:53:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:53:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:53:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:53:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 76%|███████▋  | 783/1024 [35:24:41<11:33:20, 172.61s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0011829741997644305, 'learning_rate': 1e-05, 'num_tokens': 710662211.0, 'completions/mean_length': 8881.828125, 'completions/min_length': 1429.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8381.68359375, 'completions/min_terminated_length': 1429.0, 'completions/max_terminated_length': 15760.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2025182545185089, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020823199301958084, 'sampling/sampling_logp_difference/max': 3.4221458435058594, 'sampling/importance_sampling_ratio/min': 0.032642316073179245, 'sampling/importance_sampling_ratio/mean': 1.000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.357791371172425e-05, 'epoch': 0.72}
+
+ 76%|███████▋  | 783/1024 [35:24:41<11:33:20, 172.61s/it][AINFO 12-02 06:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:55:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:55:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 77%|███████▋  | 784/1024 [35:27:36<11:34:20, 173.59s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015819448744878173, 'learning_rate': 1e-05, 'num_tokens': 711714453.0, 'completions/mean_length': 8061.890625, 'completions/min_length': 1620.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7652.60595703125, 'completions/min_terminated_length': 1620.0, 'completions/max_terminated_length': 15926.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.344813734292984, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02047148160636425, 'sampling/sampling_logp_difference/max': 3.271787166595459, 'sampling/importance_sampling_ratio/min': 0.03793856501579285, 'sampling/importance_sampling_ratio/mean': 1.0000104904174805, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.395261470970581e-05, 'epoch': 0.72}
+
+ 77%|███████▋  | 784/1024 [35:27:36<11:34:20, 173.59s/it][AINFO 12-02 06:58:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:58:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:58:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 06:58:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 77%|███████▋  | 785/1024 [35:30:39<11:42:00, 176.24s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013292088406160474, 'learning_rate': 1e-05, 'num_tokens': 712810401.0, 'completions/mean_length': 8395.71875, 'completions/min_length': 1142.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8138.0322265625, 'completions/min_terminated_length': 1142.0, 'completions/max_terminated_length': 15953.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020755447447299957, 'sampling/sampling_logp_difference/max': 3.874601364135742, 'sampling/importance_sampling_ratio/min': 0.02076261304318905, 'sampling/importance_sampling_ratio/mean': 0.9999847412109375, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.242292636216007e-05, 'epoch': 0.72}
+
+ 77%|███████▋  | 785/1024 [35:30:39<11:42:00, 176.24s/it][AINFO 12-02 07:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:01:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 77%|███████▋  | 786/1024 [35:33:09<11:07:44, 168.34s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0038182721473276615, 'learning_rate': 1e-05, 'num_tokens': 713661144.0, 'completions/mean_length': 6470.5546875, 'completions/min_length': 697.0, 'completions/max_length': 15884.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6470.5546875, 'completions/min_terminated_length': 697.0, 'completions/max_terminated_length': 15884.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.20069602131843567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020029950886964798, 'sampling/sampling_logp_difference/max': 2.091876268386841, 'sampling/importance_sampling_ratio/min': 0.2018069177865982, 'sampling/importance_sampling_ratio/mean': 1.000002145767212, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.141546392271266e-05, 'epoch': 0.72}
+
+ 77%|███████▋  | 786/1024 [35:33:09<11:07:44, 168.34s/it][AINFO 12-02 07:04:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:04:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:04:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:04:25 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 77%|███████▋  | 787/1024 [35:36:27<11:40:21, 177.30s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016282083233818412, 'learning_rate': 1e-05, 'num_tokens': 714836059.0, 'completions/mean_length': 9027.8984375, 'completions/min_length': 1377.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 8404.5, 'completions/min_terminated_length': 1377.0, 'completions/max_terminated_length': 16272.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020558331161737442, 'sampling/sampling_logp_difference/max': 2.3074402809143066, 'sampling/importance_sampling_ratio/min': 0.09951566159725189, 'sampling/importance_sampling_ratio/mean': 1.0000088214874268, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.0778362037817715e-05, 'epoch': 0.72}
+
+ 77%|███████▋  | 787/1024 [35:36:27<11:40:21, 177.30s/it][AINFO 12-02 07:07:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:07:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 77%|███████▋  | 788/1024 [35:39:28<11:42:11, 178.52s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0013070562854409218, 'learning_rate': 1e-05, 'num_tokens': 715874011.0, 'completions/mean_length': 7940.4375, 'completions/min_length': 718.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7451.966796875, 'completions/min_terminated_length': 718.0, 'completions/max_terminated_length': 16068.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02230507880449295, 'sampling/sampling_logp_difference/max': 6.8359503746032715, 'sampling/importance_sampling_ratio/min': 0.001074445666745305, 'sampling/importance_sampling_ratio/mean': 0.9999250173568726, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.8969428235068335e-05, 'epoch': 0.72}
+
+ 77%|███████▋  | 788/1024 [35:39:28<11:42:11, 178.52s/it][AINFO 12-02 07:10:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:10:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:10:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:10:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 77%|███████▋  | 789/1024 [35:42:03<11:10:47, 171.27s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.00262087513692677, 'learning_rate': 1e-05, 'num_tokens': 716857471.0, 'completions/mean_length': 7546.03125, 'completions/min_length': 1517.0, 'completions/max_length': 15977.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7546.03125, 'completions/min_terminated_length': 1517.0, 'completions/max_terminated_length': 15977.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3379838466644287, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019527148455381393, 'sampling/sampling_logp_difference/max': 3.3157267570495605, 'sampling/importance_sampling_ratio/min': 0.03630765154957771, 'sampling/importance_sampling_ratio/mean': 1.0000114440917969, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.18476282802294e-05, 'epoch': 0.73}
+
+ 77%|███████▋  | 789/1024 [35:42:03<11:10:47, 171.27s/it][AINFO 12-02 07:13:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:13:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:13:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:13:19 [block_pool.py:292] Successfully reset prefix cache
+
+ 77%|███████▋  | 790/1024 [35:44:50<11:03:44, 170.19s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0024619572795927525, 'learning_rate': 1e-05, 'num_tokens': 717731154.0, 'completions/mean_length': 6662.9609375, 'completions/min_length': 787.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6267.79638671875, 'completions/min_terminated_length': 787.0, 'completions/max_terminated_length': 16177.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02002793923020363, 'sampling/sampling_logp_difference/max': 2.820876121520996, 'sampling/importance_sampling_ratio/min': 0.05955374613404274, 'sampling/importance_sampling_ratio/mean': 1.0000267028808594, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.324314980091003e-05, 'epoch': 0.73}
+
+ 77%|███████▋  | 790/1024 [35:44:50<11:03:44, 170.19s/it][AINFO 12-02 07:16:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:16:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:16:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:16:07 [block_pool.py:292] Successfully reset prefix cache
+
+ 77%|███████▋  | 791/1024 [35:47:50<11:11:45, 172.99s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0007935090688988566, 'learning_rate': 1e-05, 'num_tokens': 718665035.0, 'completions/mean_length': 7127.1953125, 'completions/min_length': 1190.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6750.90234375, 'completions/min_terminated_length': 1190.0, 'completions/max_terminated_length': 16382.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.1344047486782074, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.018968485295772552, 'sampling/sampling_logp_difference/max': 2.452794313430786, 'sampling/importance_sampling_ratio/min': 0.0860527902841568, 'sampling/importance_sampling_ratio/mean': 1.0000171661376953, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.471330321895948e-05, 'epoch': 0.73}
+
+ 77%|███████▋  | 791/1024 [35:47:50<11:11:45, 172.99s/it][AINFO 12-02 07:19:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:19:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:19:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:19:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 77%|███████▋  | 792/1024 [35:50:27<10:51:05, 168.38s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002413337118923664, 'learning_rate': 1e-05, 'num_tokens': 719514466.0, 'completions/mean_length': 6478.6171875, 'completions/min_length': 1348.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6400.6220703125, 'completions/min_terminated_length': 1348.0, 'completions/max_terminated_length': 16248.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.21990221738815308, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02019413933157921, 'sampling/sampling_logp_difference/max': 2.8142311573028564, 'sampling/importance_sampling_ratio/min': 0.05995079129934311, 'sampling/importance_sampling_ratio/mean': 0.9999282360076904, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.076619597981335e-05, 'epoch': 0.73}
+
+ 77%|███████▋  | 792/1024 [35:50:27<10:51:05, 168.38s/it][AINFO 12-02 07:21:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:21:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:21:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:21:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 77%|███████▋  | 793/1024 [35:53:07<10:37:34, 165.61s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001896189758554101, 'learning_rate': 1e-05, 'num_tokens': 720344254.0, 'completions/mean_length': 6333.65625, 'completions/min_length': 1183.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6254.51953125, 'completions/min_terminated_length': 1183.0, 'completions/max_terminated_length': 15976.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01940145343542099, 'sampling/sampling_logp_difference/max': 2.480757474899292, 'sampling/importance_sampling_ratio/min': 0.08367981761693954, 'sampling/importance_sampling_ratio/mean': 1.0000386238098145, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.726810859916441e-05, 'epoch': 0.73}
+
+ 77%|███████▋  | 793/1024 [35:53:07<10:37:34, 165.61s/it][AINFO 12-02 07:24:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:24:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:24:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:24:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 794/1024 [35:55:53<10:35:20, 165.74s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002138897543773055, 'learning_rate': 1e-05, 'num_tokens': 721299121.0, 'completions/mean_length': 7274.3984375, 'completions/min_length': 509.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7055.7685546875, 'completions/min_terminated_length': 509.0, 'completions/max_terminated_length': 15874.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.21436560153961182, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019833650439977646, 'sampling/sampling_logp_difference/max': 2.9509506225585938, 'sampling/importance_sampling_ratio/min': 0.05228997394442558, 'sampling/importance_sampling_ratio/mean': 0.9999841451644897, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.2169948238115467e-05, 'epoch': 0.73}
+
+ 78%|███████▊  | 794/1024 [35:55:53<10:35:20, 165.74s/it][AINFO 12-02 07:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:27:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:27:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 795/1024 [35:58:35<10:28:45, 164.74s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0014769171830266714, 'learning_rate': 1e-05, 'num_tokens': 722209611.0, 'completions/mean_length': 6982.765625, 'completions/min_length': 1107.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6679.5, 'completions/min_terminated_length': 1107.0, 'completions/max_terminated_length': 16192.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.29432153701782227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018766991794109344, 'sampling/sampling_logp_difference/max': 10.42121410369873, 'sampling/importance_sampling_ratio/min': 2.9793685826007277e-05, 'sampling/importance_sampling_ratio/mean': 1.0000135898590088, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.980734679018497e-05, 'epoch': 0.73}
+
+ 78%|███████▊  | 795/1024 [35:58:35<10:28:45, 164.74s/it][AINFO 12-02 07:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:29:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 796/1024 [36:01:37<10:45:14, 169.80s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001591369742527604, 'learning_rate': 1e-05, 'num_tokens': 723348542.0, 'completions/mean_length': 8732.3984375, 'completions/min_length': 1293.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 8153.70654296875, 'completions/min_terminated_length': 1293.0, 'completions/max_terminated_length': 15145.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2580229640007019, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020438428968191147, 'sampling/sampling_logp_difference/max': 2.8545141220092773, 'sampling/importance_sampling_ratio/min': 0.05758379399776459, 'sampling/importance_sampling_ratio/mean': 1.0000627040863037, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.3875962091988185e-05, 'epoch': 0.73}
+
+ 78%|███████▊  | 796/1024 [36:01:37<10:45:14, 169.80s/it][AINFO 12-02 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:32:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 797/1024 [36:04:40<10:58:01, 173.93s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.001132588367909193, 'learning_rate': 1e-05, 'num_tokens': 724435175.0, 'completions/mean_length': 8311.0078125, 'completions/min_length': 1048.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7843.97509765625, 'completions/min_terminated_length': 1048.0, 'completions/max_terminated_length': 16358.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.16439256072044373, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02247772365808487, 'sampling/sampling_logp_difference/max': 2.456425666809082, 'sampling/importance_sampling_ratio/min': 0.08574086427688599, 'sampling/importance_sampling_ratio/mean': 0.9999547004699707, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.116801735814079e-05, 'epoch': 0.73}
+
+ 78%|███████▊  | 797/1024 [36:04:40<10:58:01, 173.93s/it][AINFO 12-02 07:35:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:35:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:35:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:35:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 78%|███████▊  | 798/1024 [36:07:41<11:02:32, 175.90s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.00132716353982687, 'learning_rate': 1e-05, 'num_tokens': 725488415.0, 'completions/mean_length': 8065.9375, 'completions/min_length': 768.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7361.01708984375, 'completions/min_terminated_length': 768.0, 'completions/max_terminated_length': 15491.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.26249873638153076, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021078957244753838, 'sampling/sampling_logp_difference/max': 2.5854902267456055, 'sampling/importance_sampling_ratio/min': 0.07535912841558456, 'sampling/importance_sampling_ratio/mean': 0.9999963641166687, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1580967263143975e-05, 'epoch': 0.73}
+
+ 78%|███████▊  | 798/1024 [36:07:41<11:02:32, 175.90s/it][AINFO 12-02 07:38:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:38:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:38:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:38:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 78%|███████▊  | 799/1024 [36:10:43<11:07:04, 177.88s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0016145105473697186, 'learning_rate': 1e-05, 'num_tokens': 726557255.0, 'completions/mean_length': 8185.875, 'completions/min_length': 1077.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8055.74658203125, 'completions/min_terminated_length': 1077.0, 'completions/max_terminated_length': 16032.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.25012245774269104, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021065544337034225, 'sampling/sampling_logp_difference/max': 3.635584831237793, 'sampling/importance_sampling_ratio/min': 0.026368509978055954, 'sampling/importance_sampling_ratio/mean': 1.0000089406967163, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7708056993324135e-05, 'epoch': 0.74}
+
+ 78%|███████▊  | 799/1024 [36:10:43<11:07:04, 177.88s/it][AINFO 12-02 07:42:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:42:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 800/1024 [36:13:28<10:49:17, 173.92s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0017410339787602425, 'learning_rate': 1e-05, 'num_tokens': 727533492.0, 'completions/mean_length': 7452.5390625, 'completions/min_length': 1178.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7310.77001953125, 'completions/min_terminated_length': 1178.0, 'completions/max_terminated_length': 15990.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.26037710905075073, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021367521956562996, 'sampling/sampling_logp_difference/max': 1.9986705780029297, 'sampling/importance_sampling_ratio/min': 0.13551531732082367, 'sampling/importance_sampling_ratio/mean': 0.9999819993972778, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.109855833827169e-05, 'epoch': 0.74}
+
+ 78%|███████▊  | 800/1024 [36:13:28<10:49:17, 173.92s/it][AINFO 12-02 07:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:44:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:44:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 801/1024 [36:16:19<10:43:38, 173.18s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002818804234266281, 'learning_rate': 1e-05, 'num_tokens': 728572223.0, 'completions/mean_length': 7949.7109375, 'completions/min_length': 1428.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7815.833984375, 'completions/min_terminated_length': 1428.0, 'completions/max_terminated_length': 16018.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2919674217700958, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02111595869064331, 'sampling/sampling_logp_difference/max': 3.9194600582122803, 'sampling/importance_sampling_ratio/min': 0.019851811230182648, 'sampling/importance_sampling_ratio/mean': 1.0000503063201904, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.21578107913956e-05, 'epoch': 0.74}
+
+ 78%|███████▊  | 801/1024 [36:16:19<10:43:38, 173.18s/it][AINFO 12-02 07:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:47:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:47:36 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 802/1024 [36:19:07<10:34:03, 171.37s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0019822493195533752, 'learning_rate': 1e-05, 'num_tokens': 729528370.0, 'completions/mean_length': 7313.8984375, 'completions/min_length': 617.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6867.82763671875, 'completions/min_terminated_length': 617.0, 'completions/max_terminated_length': 14298.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018777422606945038, 'sampling/sampling_logp_difference/max': 2.4803037643432617, 'sampling/importance_sampling_ratio/min': 0.08371778577566147, 'sampling/importance_sampling_ratio/mean': 1.0000720024108887, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.053148586535826e-05, 'epoch': 0.74}
+
+ 78%|███████▊  | 802/1024 [36:19:07<10:34:03, 171.37s/it][AINFO 12-02 07:50:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:23 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:50:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 78%|███████▊  | 803/1024 [36:21:46<10:17:59, 167.78s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.000810184224974364, 'learning_rate': 1e-05, 'num_tokens': 730462076.0, 'completions/mean_length': 7101.953125, 'completions/min_length': 777.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6645.45849609375, 'completions/min_terminated_length': 777.0, 'completions/max_terminated_length': 15923.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.19332444667816162, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020264681428670883, 'sampling/sampling_logp_difference/max': 2.618542432785034, 'sampling/importance_sampling_ratio/min': 0.07290905714035034, 'sampling/importance_sampling_ratio/mean': 1.0000543594360352, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.047073431616809e-05, 'epoch': 0.74}
+
+ 78%|███████▊  | 803/1024 [36:21:46<10:17:59, 167.78s/it][AINFO 12-02 07:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:53:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 79%|███████▊  | 804/1024 [36:24:30<10:10:49, 166.59s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.000751432147808373, 'learning_rate': 1e-05, 'num_tokens': 731337795.0, 'completions/mean_length': 6685.3671875, 'completions/min_length': 454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6372.5078125, 'completions/min_terminated_length': 454.0, 'completions/max_terminated_length': 15612.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.1938612163066864, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01760057732462883, 'sampling/sampling_logp_difference/max': 10.824111938476562, 'sampling/importance_sampling_ratio/min': 1.9913513824576512e-05, 'sampling/importance_sampling_ratio/mean': 1.0000591278076172, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.076674364725477e-05, 'epoch': 0.74}
+
+ 79%|███████▊  | 804/1024 [36:24:30<10:10:49, 166.59s/it][AINFO 12-02 07:55:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:55:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:55:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:55:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 79%|███████▊  | 805/1024 [36:27:08<9:59:11, 164.16s/it] [A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001991275465115905, 'learning_rate': 1e-05, 'num_tokens': 732349621.0, 'completions/mean_length': 7733.265625, 'completions/min_length': 2362.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7665.1494140625, 'completions/min_terminated_length': 2362.0, 'completions/max_terminated_length': 16078.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.24936595559120178, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02098526433110237, 'sampling/sampling_logp_difference/max': 2.1174850463867188, 'sampling/importance_sampling_ratio/min': 0.12033388018608093, 'sampling/importance_sampling_ratio/mean': 0.9999371767044067, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.930115437673521e-05, 'epoch': 0.74}
+
+ 79%|███████▊  | 805/1024 [36:27:08<9:59:11, 164.16s/it][AINFO 12-02 07:58:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:58:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:58:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 07:58:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 79%|███████▊  | 806/1024 [36:29:53<9:56:39, 164.22s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.00211785058490932, 'learning_rate': 1e-05, 'num_tokens': 733304709.0, 'completions/mean_length': 7293.4375, 'completions/min_length': 1030.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7149.14306640625, 'completions/min_terminated_length': 1030.0, 'completions/max_terminated_length': 15589.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02271895855665207, 'sampling/sampling_logp_difference/max': 7.624327659606934, 'sampling/importance_sampling_ratio/min': 0.0004884235095232725, 'sampling/importance_sampling_ratio/mean': 1.000064730644226, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.48065991349722e-05, 'epoch': 0.74}
+
+ 79%|███████▊  | 806/1024 [36:29:53<9:56:39, 164.22s/it][AINFO 12-02 08:01:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:01:09 [block_pool.py:292] Successfully reset prefix cache
+
+ 79%|███████▉  | 807/1024 [36:32:54<10:12:40, 169.40s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0022908656392246485, 'learning_rate': 1e-05, 'num_tokens': 734410879.0, 'completions/mean_length': 8508.203125, 'completions/min_length': 1198.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7983.150390625, 'completions/min_terminated_length': 1198.0, 'completions/max_terminated_length': 16382.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019088217988610268, 'sampling/sampling_logp_difference/max': 2.5162603855133057, 'sampling/importance_sampling_ratio/min': 0.08076106011867523, 'sampling/importance_sampling_ratio/mean': 1.0000392198562622, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.132740827524685e-05, 'epoch': 0.74}
+
+ 79%|███████▉  | 807/1024 [36:32:54<10:12:40, 169.40s/it][AINFO 12-02 08:04:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:04:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 79%|███████▉  | 808/1024 [36:35:44<10:10:02, 169.46s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002643751911818981, 'learning_rate': 1e-05, 'num_tokens': 735409609.0, 'completions/mean_length': 7625.453125, 'completions/min_length': 2025.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7269.41455078125, 'completions/min_terminated_length': 2025.0, 'completions/max_terminated_length': 15734.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2556639611721039, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019166799262166023, 'sampling/sampling_logp_difference/max': 2.0408592224121094, 'sampling/importance_sampling_ratio/min': 0.12991704046726227, 'sampling/importance_sampling_ratio/mean': 1.0000027418136597, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.714699343821849e-05, 'epoch': 0.74}
+
+ 79%|███████▉  | 808/1024 [36:35:44<10:10:02, 169.46s/it][AINFO 12-02 08:07:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:07:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:07:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:07:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 79%|███████▉  | 809/1024 [36:38:35<10:09:09, 170.00s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0014902573311701417, 'learning_rate': 1e-05, 'num_tokens': 736298829.0, 'completions/mean_length': 6794.15625, 'completions/min_length': 915.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6564.00048828125, 'completions/min_terminated_length': 915.0, 'completions/max_terminated_length': 15885.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3295465111732483, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01866999641060829, 'sampling/sampling_logp_difference/max': 2.1783030033111572, 'sampling/importance_sampling_ratio/min': 0.1132335215806961, 'sampling/importance_sampling_ratio/mean': 1.0000360012054443, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.608694700891647e-05, 'epoch': 0.74}
+
+ 79%|███████▉  | 809/1024 [36:38:35<10:09:09, 170.00s/it][AINFO 12-02 08:09:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:09:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 79%|███████▉  | 810/1024 [36:41:43<10:25:18, 175.32s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0005254637217149138, 'learning_rate': 1e-05, 'num_tokens': 737331130.0, 'completions/mean_length': 7912.2890625, 'completions/min_length': 1438.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7271.57177734375, 'completions/min_terminated_length': 1438.0, 'completions/max_terminated_length': 16130.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.17358636856079102, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02023245394229889, 'sampling/sampling_logp_difference/max': 3.6128268241882324, 'sampling/importance_sampling_ratio/min': 0.1256365329027176, 'sampling/importance_sampling_ratio/mean': 1.0001132488250732, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.775018114993145e-05, 'epoch': 0.75}
+
+ 79%|███████▉  | 810/1024 [36:41:43<10:25:18, 175.32s/it][AINFO 12-02 08:12:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:12:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 79%|███████▉  | 811/1024 [36:44:18<10:01:19, 169.39s/it][A
+                                                         [A{'loss': 0.0001, 'grad_norm': 0.0015723281539976597, 'learning_rate': 1e-05, 'num_tokens': 738216430.0, 'completions/mean_length': 6774.15625, 'completions/min_length': 803.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6621.61962890625, 'completions/min_terminated_length': 803.0, 'completions/max_terminated_length': 14393.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020094335079193115, 'sampling/sampling_logp_difference/max': 4.370855331420898, 'sampling/importance_sampling_ratio/min': 0.012640425004065037, 'sampling/importance_sampling_ratio/mean': 0.999991774559021, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.744910484077991e-05, 'epoch': 0.75}
+
+ 79%|███████▉  | 811/1024 [36:44:18<10:01:19, 169.39s/it][AINFO 12-02 08:15:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:15:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 79%|███████▉  | 812/1024 [36:47:08<9:59:21, 169.63s/it] [A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0014076496008783579, 'learning_rate': 1e-05, 'num_tokens': 739233439.0, 'completions/mean_length': 7781.0703125, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7431.357421875, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.1922685205936432, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020636066794395447, 'sampling/sampling_logp_difference/max': 2.410329580307007, 'sampling/importance_sampling_ratio/min': 0.08978570252656937, 'sampling/importance_sampling_ratio/mean': 0.9999973773956299, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.758078287319222e-05, 'epoch': 0.75}
+
+ 79%|███████▉  | 812/1024 [36:47:08<9:59:21, 169.63s/it][AINFO 12-02 08:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:18:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 79%|███████▉  | 813/1024 [36:50:01<9:59:16, 170.41s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0012137920130044222, 'learning_rate': 1e-05, 'num_tokens': 740095843.0, 'completions/mean_length': 6580.03125, 'completions/min_length': 791.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6424.4130859375, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 16265.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2001592367887497, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020469870418310165, 'sampling/sampling_logp_difference/max': 8.33084487915039, 'sampling/importance_sampling_ratio/min': 0.00024096836568787694, 'sampling/importance_sampling_ratio/mean': 0.9999499917030334, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.2960131673244177e-05, 'epoch': 0.75}
+
+ 79%|███████▉  | 813/1024 [36:50:01<9:59:16, 170.41s/it][AINFO 12-02 08:21:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:21:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:21:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:21:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 79%|███████▉  | 814/1024 [36:52:57<10:02:59, 172.28s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0021601328626275063, 'learning_rate': 1e-05, 'num_tokens': 741149189.0, 'completions/mean_length': 8065.328125, 'completions/min_length': 2110.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7656.212890625, 'completions/min_terminated_length': 2110.0, 'completions/max_terminated_length': 15620.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.31116873025894165, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019389986991882324, 'sampling/sampling_logp_difference/max': 2.718200206756592, 'sampling/importance_sampling_ratio/min': 0.06599342077970505, 'sampling/importance_sampling_ratio/mean': 0.9999868869781494, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.019363622224773e-05, 'epoch': 0.75}
+
+ 79%|███████▉  | 814/1024 [36:52:57<10:02:59, 172.28s/it][AINFO 12-02 08:24:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:24:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 80%|███████▉  | 815/1024 [36:55:51<10:01:55, 172.80s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0018050484359264374, 'learning_rate': 1e-05, 'num_tokens': 742117038.0, 'completions/mean_length': 7389.2578125, 'completions/min_length': 606.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7099.1044921875, 'completions/min_terminated_length': 606.0, 'completions/max_terminated_length': 15746.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2924865782260895, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02070988528430462, 'sampling/sampling_logp_difference/max': 20.261369705200195, 'sampling/importance_sampling_ratio/min': 1.5870803560247282e-09, 'sampling/importance_sampling_ratio/mean': 0.9999200701713562, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.724831678184273e-05, 'epoch': 0.75}
+
+ 80%|███████▉  | 815/1024 [36:55:51<10:01:55, 172.80s/it][AINFO 12-02 08:27:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:27:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:27:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:27:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 80%|███████▉  | 816/1024 [36:58:37<9:51:29, 170.62s/it] [A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0017399805365130305, 'learning_rate': 1e-05, 'num_tokens': 743163131.0, 'completions/mean_length': 8027.7265625, 'completions/min_length': 1487.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7470.64208984375, 'completions/min_terminated_length': 1487.0, 'completions/max_terminated_length': 14264.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2977414131164551, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020387347787618637, 'sampling/sampling_logp_difference/max': 5.879062175750732, 'sampling/importance_sampling_ratio/min': 0.002797407563775778, 'sampling/importance_sampling_ratio/mean': 0.9999685287475586, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.979448062338633e-05, 'epoch': 0.75}
+
+ 80%|███████▉  | 816/1024 [36:58:37<9:51:29, 170.62s/it][AINFO 12-02 08:29:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:29:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:29:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:29:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 80%|███████▉  | 817/1024 [37:01:16<9:36:43, 167.17s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.00266625895164907, 'learning_rate': 1e-05, 'num_tokens': 744045547.0, 'completions/mean_length': 6733.25, 'completions/min_length': 1294.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6580.06396484375, 'completions/min_terminated_length': 1294.0, 'completions/max_terminated_length': 15845.0, 'rewards/accuracy_reward/mean': 0.6328125, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.6328125, 'reward_std': 0.33850300312042236, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01928819715976715, 'sampling/sampling_logp_difference/max': 3.0133838653564453, 'sampling/importance_sampling_ratio/min': 0.10213258862495422, 'sampling/importance_sampling_ratio/mean': 0.9999620914459229, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.82815509410284e-05, 'epoch': 0.75}
+
+ 80%|███████▉  | 817/1024 [37:01:16<9:36:43, 167.17s/it][AINFO 12-02 08:32:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:32:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:32:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:32:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 80%|███████▉  | 818/1024 [37:04:03<9:34:05, 167.21s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001680571585893631, 'learning_rate': 1e-05, 'num_tokens': 744995548.0, 'completions/mean_length': 7278.0078125, 'completions/min_length': 758.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6984.26611328125, 'completions/min_terminated_length': 758.0, 'completions/max_terminated_length': 16217.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018340932205319405, 'sampling/sampling_logp_difference/max': 1.664870262145996, 'sampling/importance_sampling_ratio/min': 0.18921519815921783, 'sampling/importance_sampling_ratio/mean': 1.0000802278518677, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.102077759351232e-05, 'epoch': 0.75}
+
+ 80%|███████▉  | 818/1024 [37:04:03<9:34:05, 167.21s/it][AINFO 12-02 08:35:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:35:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:35:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:35:20 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 80%|███████▉  | 819/1024 [37:06:58<9:38:40, 169.37s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001999449450522661, 'learning_rate': 1e-05, 'num_tokens': 745916608.0, 'completions/mean_length': 7027.40625, 'completions/min_length': 1067.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6953.732421875, 'completions/min_terminated_length': 1067.0, 'completions/max_terminated_length': 16119.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021387305110692978, 'sampling/sampling_logp_difference/max': 2.9631943702697754, 'sampling/importance_sampling_ratio/min': 0.051653649657964706, 'sampling/importance_sampling_ratio/mean': 1.0000839233398438, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0830878586129984e-05, 'epoch': 0.75}
+
+ 80%|███████▉  | 819/1024 [37:06:58<9:38:40, 169.37s/it][AINFO 12-02 08:38:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:38:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:38:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:38:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 80%|████████  | 820/1024 [37:09:39<9:27:50, 167.01s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002130510052666068, 'learning_rate': 1e-05, 'num_tokens': 746790102.0, 'completions/mean_length': 6666.484375, 'completions/min_length': 1166.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6512.23828125, 'completions/min_terminated_length': 1166.0, 'completions/max_terminated_length': 15386.0, 'rewards/accuracy_reward/mean': 0.6640625, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.6640625, 'reward_std': 0.14465448260307312, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.01852484792470932, 'sampling/sampling_logp_difference/max': 1.7045992612838745, 'sampling/importance_sampling_ratio/min': 0.18184524774551392, 'sampling/importance_sampling_ratio/mean': 0.9999904036521912, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.705602662445017e-05, 'epoch': 0.75}
+
+ 80%|████████  | 820/1024 [37:09:39<9:27:50, 167.01s/it][AINFO 12-02 08:40:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:40:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:40:56 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:40:56 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 80%|████████  | 821/1024 [37:12:21<9:20:03, 165.53s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017849340802058578, 'learning_rate': 1e-05, 'num_tokens': 747732698.0, 'completions/mean_length': 7234.03125, 'completions/min_length': 1217.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7161.984375, 'completions/min_terminated_length': 1217.0, 'completions/max_terminated_length': 15954.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3435155153274536, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01989883929491043, 'sampling/sampling_logp_difference/max': 2.013209581375122, 'sampling/importance_sampling_ratio/min': 0.13355931639671326, 'sampling/importance_sampling_ratio/mean': 0.9999781847000122, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.313735493928107e-05, 'epoch': 0.76}
+
+ 80%|████████  | 821/1024 [37:12:21<9:20:03, 165.53s/it][AINFO 12-02 08:43:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:43:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:43:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:43:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 80%|████████  | 822/1024 [37:15:23<9:33:20, 170.30s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002164609497413039, 'learning_rate': 1e-05, 'num_tokens': 748816232.0, 'completions/mean_length': 8294.671875, 'completions/min_length': 1551.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7755.3837890625, 'completions/min_terminated_length': 1551.0, 'completions/max_terminated_length': 15893.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.29432153701782227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01945515349507332, 'sampling/sampling_logp_difference/max': 2.682100534439087, 'sampling/importance_sampling_ratio/min': 0.06841928511857986, 'sampling/importance_sampling_ratio/mean': 0.9999738335609436, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.64018766656227e-05, 'epoch': 0.76}
+
+ 80%|████████  | 822/1024 [37:15:23<9:33:20, 170.30s/it][AINFO 12-02 08:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:46:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 80%|████████  | 823/1024 [37:18:25<9:42:49, 173.98s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0012596385786309838, 'learning_rate': 1e-05, 'num_tokens': 749791557.0, 'completions/mean_length': 7470.6640625, 'completions/min_length': 1285.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7183.13671875, 'completions/min_terminated_length': 1285.0, 'completions/max_terminated_length': 16247.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2767001986503601, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019131433218717575, 'sampling/sampling_logp_difference/max': 3.9791970252990723, 'sampling/importance_sampling_ratio/min': 0.018700649961829185, 'sampling/importance_sampling_ratio/mean': 1.0000030994415283, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.733723264711443e-05, 'epoch': 0.76}
+
+ 80%|████████  | 823/1024 [37:18:25<9:42:49, 173.98s/it][AINFO 12-02 08:49:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:49:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:49:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:49:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 80%|████████  | 824/1024 [37:21:51<10:11:46, 183.53s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.002169548301026225, 'learning_rate': 1e-05, 'num_tokens': 751060169.0, 'completions/mean_length': 9761.78125, 'completions/min_length': 1007.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 9139.1796875, 'completions/min_terminated_length': 1007.0, 'completions/max_terminated_length': 16020.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2964382767677307, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020143693313002586, 'sampling/sampling_logp_difference/max': 4.361732482910156, 'sampling/importance_sampling_ratio/min': 0.012756268493831158, 'sampling/importance_sampling_ratio/mean': 1.0000452995300293, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.994015504664276e-05, 'epoch': 0.76}
+
+ 80%|████████  | 824/1024 [37:21:51<10:11:46, 183.53s/it][AINFO 12-02 08:53:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:53:08 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 81%|████████  | 825/1024 [37:24:57<10:11:21, 184.33s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0015827392926439643, 'learning_rate': 1e-05, 'num_tokens': 752047463.0, 'completions/mean_length': 7550.046875, 'completions/min_length': 1273.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7115.58984375, 'completions/min_terminated_length': 1273.0, 'completions/max_terminated_length': 16349.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.24541422724723816, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020318055525422096, 'sampling/sampling_logp_difference/max': 2.7608137130737305, 'sampling/importance_sampling_ratio/min': 0.06324028223752975, 'sampling/importance_sampling_ratio/mean': 0.9999629855155945, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.923346546092944e-05, 'epoch': 0.76}
+
+ 81%|████████  | 825/1024 [37:24:57<10:11:21, 184.33s/it][AINFO 12-02 08:56:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:56:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:56:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:56:14 [block_pool.py:292] Successfully reset prefix cache
+
+ 81%|████████  | 826/1024 [37:28:01<10:07:24, 184.06s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0010461772326380014, 'learning_rate': 1e-05, 'num_tokens': 753172815.0, 'completions/mean_length': 8636.5625, 'completions/min_length': 1659.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8386.64453125, 'completions/min_terminated_length': 1659.0, 'completions/max_terminated_length': 15814.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.22673210501670837, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02148914709687233, 'sampling/sampling_logp_difference/max': 3.185965061187744, 'sampling/importance_sampling_ratio/min': 0.04133833199739456, 'sampling/importance_sampling_ratio/mean': 1.000035285949707, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.4870734427604475e-05, 'epoch': 0.76}
+
+ 81%|████████  | 826/1024 [37:28:01<10:07:24, 184.06s/it][AINFO 12-02 08:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 08:59:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 81%|████████  | 827/1024 [37:31:04<10:03:54, 183.93s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.0020099047105759382, 'learning_rate': 1e-05, 'num_tokens': 754090789.0, 'completions/mean_length': 6997.171875, 'completions/min_length': 882.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6454.1318359375, 'completions/min_terminated_length': 882.0, 'completions/max_terminated_length': 16094.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.22461041808128357, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01756388321518898, 'sampling/sampling_logp_difference/max': 2.1259448528289795, 'sampling/importance_sampling_ratio/min': 0.11932017654180527, 'sampling/importance_sampling_ratio/mean': 0.999937117099762, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.729950174147234e-05, 'epoch': 0.76}
+
+ 81%|████████  | 827/1024 [37:31:04<10:03:54, 183.93s/it][AINFO 12-02 09:02:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:02:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:02:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:02:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 81%|████████  | 828/1024 [37:34:15<10:07:39, 186.02s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.001179098035208881, 'learning_rate': 1e-05, 'num_tokens': 755262799.0, 'completions/mean_length': 8930.390625, 'completions/min_length': 1356.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8440.5, 'completions/min_terminated_length': 1356.0, 'completions/max_terminated_length': 15756.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.21778056025505066, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02159072458744049, 'sampling/sampling_logp_difference/max': 11.629781723022461, 'sampling/importance_sampling_ratio/min': 8.897130101104267e-06, 'sampling/importance_sampling_ratio/mean': 0.9999356865882874, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1768590790525195e-05, 'epoch': 0.76}
+
+ 81%|████████  | 828/1024 [37:34:15<10:07:39, 186.02s/it][AINFO 12-02 09:05:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:05:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:05:32 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:05:32 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+[OpenTinker] 2025-12-02 09:07:39,387 - math_verify.grader - WARNING - Timeout during comparison
+
+ 81%|████████  | 829/1024 [37:37:28<10:10:48, 187.94s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0009136886801570654, 'learning_rate': 1e-05, 'num_tokens': 756402891.0, 'completions/mean_length': 8757.03125, 'completions/min_length': 1617.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8248.5673828125, 'completions/min_terminated_length': 1617.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.27670514583587646, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020860493183135986, 'sampling/sampling_logp_difference/max': 4.195775508880615, 'sampling/importance_sampling_ratio/min': 0.015059059485793114, 'sampling/importance_sampling_ratio/mean': 1.0000464916229248, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6574745428861206e-05, 'epoch': 0.76}
+
+ 81%|████████  | 829/1024 [37:37:28<10:10:48, 187.94s/it][AINFO 12-02 09:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:08:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:08:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 81%|████████  | 830/1024 [37:40:27<9:59:44, 185.49s/it] [A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0023916885256767273, 'learning_rate': 1e-05, 'num_tokens': 757498858.0, 'completions/mean_length': 8417.9296875, 'completions/min_length': 1520.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7886.85888671875, 'completions/min_terminated_length': 1520.0, 'completions/max_terminated_length': 15913.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.30221718549728394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01949945278465748, 'sampling/sampling_logp_difference/max': 6.685240268707275, 'sampling/importance_sampling_ratio/min': 0.0012492146342992783, 'sampling/importance_sampling_ratio/mean': 0.9999895095825195, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.715066299671889e-05, 'epoch': 0.76}
+
+ 81%|████████  | 830/1024 [37:40:27<9:59:44, 185.49s/it][AINFO 12-02 09:11:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:11:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:11:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:11:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 81%|████████  | 831/1024 [37:43:33<9:57:04, 185.62s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017350903945043683, 'learning_rate': 1e-05, 'num_tokens': 758523548.0, 'completions/mean_length': 7867.328125, 'completions/min_length': 1135.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7299.55029296875, 'completions/min_terminated_length': 1135.0, 'completions/max_terminated_length': 16311.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.20175684988498688, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019753914326429367, 'sampling/sampling_logp_difference/max': 1.7782506942749023, 'sampling/importance_sampling_ratio/min': 0.16893340647220612, 'sampling/importance_sampling_ratio/mean': 0.9999570846557617, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.501902117226564e-05, 'epoch': 0.76}
+
+ 81%|████████  | 831/1024 [37:43:33<9:57:04, 185.62s/it][AINFO 12-02 09:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:14:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 81%|████████▏ | 832/1024 [37:46:24<9:40:03, 181.27s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016087087569758296, 'learning_rate': 1e-05, 'num_tokens': 759600720.0, 'completions/mean_length': 8261.65625, 'completions/min_length': 1658.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7931.4794921875, 'completions/min_terminated_length': 1658.0, 'completions/max_terminated_length': 16111.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.23410366475582123, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019564341753721237, 'sampling/sampling_logp_difference/max': 6.611324310302734, 'sampling/importance_sampling_ratio/min': 0.001345049706287682, 'sampling/importance_sampling_ratio/mean': 1.0000338554382324, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.993191052766633e-05, 'epoch': 0.77}
+
+ 81%|████████▏ | 832/1024 [37:46:24<9:40:03, 181.27s/it][AINFO 12-02 09:17:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:17:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:17:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:17:44 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 81%|████████▏ | 833/1024 [37:49:29<9:40:32, 182.37s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0021732517052441835, 'learning_rate': 1e-05, 'num_tokens': 760622026.0, 'completions/mean_length': 7827.640625, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7691.82568359375, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15977.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02031402289867401, 'sampling/sampling_logp_difference/max': 3.184709072113037, 'sampling/importance_sampling_ratio/min': 0.04139028489589691, 'sampling/importance_sampling_ratio/mean': 0.9999998807907104, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.2085607390163204e-05, 'epoch': 0.77}
+
+ 81%|████████▏ | 833/1024 [37:49:29<9:40:32, 182.37s/it][AINFO 12-02 09:20:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:20:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:20:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:20:46 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 81%|████████▏ | 834/1024 [37:52:41<9:46:30, 185.21s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0027929043862968683, 'learning_rate': 1e-05, 'num_tokens': 761830209.0, 'completions/mean_length': 9275.2421875, 'completions/min_length': 1377.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1171875, 'completions/mean_terminated_length': 8331.6015625, 'completions/min_terminated_length': 1377.0, 'completions/max_terminated_length': 16053.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.3106446862220764, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021344974637031555, 'sampling/sampling_logp_difference/max': 6.4792866706848145, 'sampling/importance_sampling_ratio/min': 0.0015349051682278514, 'sampling/importance_sampling_ratio/mean': 1.0000190734863281, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.459921195935749e-05, 'epoch': 0.77}
+
+ 81%|████████▏ | 834/1024 [37:52:41<9:46:30, 185.21s/it][AINFO 12-02 09:23:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:23:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:23:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:23:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 82%|████████▏ | 835/1024 [37:55:45<9:41:43, 184.68s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016401964239776134, 'learning_rate': 1e-05, 'num_tokens': 762816078.0, 'completions/mean_length': 7534.7265625, 'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7174.99951171875, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 16377.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.2369818389415741, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020450875163078308, 'sampling/sampling_logp_difference/max': 2.153107166290283, 'sampling/importance_sampling_ratio/min': 0.1161227822303772, 'sampling/importance_sampling_ratio/mean': 0.9999833106994629, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.3510829729930265e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 835/1024 [37:55:45<9:41:43, 184.68s/it][AINFO 12-02 09:27:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:27:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:27:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:27:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 82%|████████▏ | 836/1024 [37:58:28<9:18:49, 178.35s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0038436860777437687, 'learning_rate': 1e-05, 'num_tokens': 763757480.0, 'completions/mean_length': 7196.203125, 'completions/min_length': 1251.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6899.822265625, 'completions/min_terminated_length': 1251.0, 'completions/max_terminated_length': 15899.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.3135228157043457, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01922577992081642, 'sampling/sampling_logp_difference/max': 1.8976764678955078, 'sampling/importance_sampling_ratio/min': 0.14991655945777893, 'sampling/importance_sampling_ratio/mean': 0.9998571872711182, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.680614867742406e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 836/1024 [37:58:28<9:18:49, 178.35s/it][AINFO 12-02 09:29:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:29:45 [block_pool.py:292] Successfully reset prefix cache
+
+ 82%|████████▏ | 837/1024 [38:01:04<8:55:10, 171.72s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001400959794409573, 'learning_rate': 1e-05, 'num_tokens': 764665625.0, 'completions/mean_length': 6939.1328125, 'completions/min_length': 1229.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6634.45947265625, 'completions/min_terminated_length': 1229.0, 'completions/max_terminated_length': 16234.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020845940336585045, 'sampling/sampling_logp_difference/max': 5.005302906036377, 'sampling/importance_sampling_ratio/min': 0.006702310871332884, 'sampling/importance_sampling_ratio/mean': 1.0000128746032715, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.662387459575257e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 837/1024 [38:01:04<8:55:10, 171.72s/it][AINFO 12-02 09:32:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:32:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:32:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:32:21 [block_pool.py:292] Successfully reset prefix cache
+
+ 82%|████████▏ | 838/1024 [38:04:14<9:08:57, 177.08s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0012298723449930549, 'learning_rate': 1e-05, 'num_tokens': 765627020.0, 'completions/mean_length': 7371.2109375, 'completions/min_length': 1167.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7080.4755859375, 'completions/min_terminated_length': 1167.0, 'completions/max_terminated_length': 15949.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01971576176583767, 'sampling/sampling_logp_difference/max': 6.020603656768799, 'sampling/importance_sampling_ratio/min': 0.0024282033555209637, 'sampling/importance_sampling_ratio/mean': 0.9999961853027344, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.170575968804769e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 838/1024 [38:04:14<9:08:57, 177.08s/it][AINFO 12-02 09:35:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:35:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:35:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:35:31 [block_pool.py:292] Successfully reset prefix cache
+
+ 82%|████████▏ | 839/1024 [38:06:42<8:39:05, 168.35s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0027633332647383213, 'learning_rate': 1e-05, 'num_tokens': 766569875.0, 'completions/mean_length': 7164.2421875, 'completions/min_length': 1373.0, 'completions/max_length': 14592.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7164.2421875, 'completions/min_terminated_length': 1373.0, 'completions/max_terminated_length': 14592.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.26037219166755676, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022373199462890625, 'sampling/sampling_logp_difference/max': 2.476754665374756, 'sampling/importance_sampling_ratio/min': 0.08401544392108917, 'sampling/importance_sampling_ratio/mean': 0.9999051094055176, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.333341078359808e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 839/1024 [38:06:42<8:39:05, 168.35s/it][AINFO 12-02 09:37:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:37:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 82%|████████▏ | 840/1024 [38:09:40<8:45:16, 171.29s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001728714327327907, 'learning_rate': 1e-05, 'num_tokens': 767584329.0, 'completions/mean_length': 7773.546875, 'completions/min_length': 722.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7199.51708984375, 'completions/min_terminated_length': 722.0, 'completions/max_terminated_length': 16004.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.22621294856071472, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01996704936027527, 'sampling/sampling_logp_difference/max': 4.702703475952148, 'sampling/importance_sampling_ratio/min': 0.009070721454918385, 'sampling/importance_sampling_ratio/mean': 1.000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.231572986805986e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 840/1024 [38:09:40<8:45:16, 171.29s/it][AINFO 12-02 09:40:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:40:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 82%|████████▏ | 841/1024 [38:12:45<8:54:34, 175.27s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0022114792373031378, 'learning_rate': 1e-05, 'num_tokens': 768705363.0, 'completions/mean_length': 8598.515625, 'completions/min_length': 1328.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 8411.6640625, 'completions/min_terminated_length': 1328.0, 'completions/max_terminated_length': 16274.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.23250605165958405, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02032552659511566, 'sampling/sampling_logp_difference/max': 2.2979352474212646, 'sampling/importance_sampling_ratio/min': 0.10046606510877609, 'sampling/importance_sampling_ratio/mean': 0.9999816417694092, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.0042122918748646e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 841/1024 [38:12:45<8:54:34, 175.27s/it][AINFO 12-02 09:44:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:44:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:44:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:44:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 82%|████████▏ | 842/1024 [38:15:33<8:45:16, 173.17s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018260134384036064, 'learning_rate': 1e-05, 'num_tokens': 769597260.0, 'completions/mean_length': 6812.2578125, 'completions/min_length': 1021.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6660.32568359375, 'completions/min_terminated_length': 1021.0, 'completions/max_terminated_length': 15402.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.34982627630233765, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01975705660879612, 'sampling/sampling_logp_difference/max': 1.2408480644226074, 'sampling/importance_sampling_ratio/min': 0.28913891315460205, 'sampling/importance_sampling_ratio/mean': 1.0000560283660889, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.739799139017123e-05, 'epoch': 0.77}
+
+ 82%|████████▏ | 842/1024 [38:15:33<8:45:16, 173.17s/it][AINFO 12-02 09:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:46:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:46:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 82%|████████▏ | 843/1024 [38:18:43<8:57:21, 178.13s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0022801088634878397, 'learning_rate': 1e-05, 'num_tokens': 770702165.0, 'completions/mean_length': 8475.6328125, 'completions/min_length': 937.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8086.6962890625, 'completions/min_terminated_length': 937.0, 'completions/max_terminated_length': 16341.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3072297275066376, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020036811009049416, 'sampling/sampling_logp_difference/max': 3.6229248046875, 'sampling/importance_sampling_ratio/min': 0.02670445665717125, 'sampling/importance_sampling_ratio/mean': 0.9999628663063049, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.789643341813644e-05, 'epoch': 0.78}
+
+ 82%|████████▏ | 843/1024 [38:18:43<8:57:21, 178.13s/it][AINFO 12-02 09:49:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:49:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:49:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:49:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 82%|████████▏ | 844/1024 [38:21:37<8:51:02, 177.01s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0020590219646692276, 'learning_rate': 1e-05, 'num_tokens': 771692495.0, 'completions/mean_length': 7550.015625, 'completions/min_length': 1275.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7265.04833984375, 'completions/min_terminated_length': 1275.0, 'completions/max_terminated_length': 16216.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.22908619046211243, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019333798438310623, 'sampling/sampling_logp_difference/max': 2.346835136413574, 'sampling/importance_sampling_ratio/min': 0.09567146748304367, 'sampling/importance_sampling_ratio/mean': 1.000032901763916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0561485573343816e-05, 'epoch': 0.78}
+
+ 82%|████████▏ | 844/1024 [38:21:37<8:51:02, 177.01s/it][AINFO 12-02 09:52:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:52:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:52:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:52:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 83%|████████▎ | 845/1024 [38:24:35<8:49:09, 177.37s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0021083143074065447, 'learning_rate': 1e-05, 'num_tokens': 772636131.0, 'completions/mean_length': 7200.15625, 'completions/min_length': 767.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6826.8291015625, 'completions/min_terminated_length': 767.0, 'completions/max_terminated_length': 16326.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019747812300920486, 'sampling/sampling_logp_difference/max': 3.272357225418091, 'sampling/importance_sampling_ratio/min': 0.037916943430900574, 'sampling/importance_sampling_ratio/mean': 1.000058889389038, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.179049180194852e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 845/1024 [38:24:35<8:49:09, 177.37s/it][AINFO 12-02 09:55:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:55:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:55:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:55:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 83%|████████▎ | 846/1024 [38:27:38<8:50:59, 178.99s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0016583296237513423, 'learning_rate': 1e-05, 'num_tokens': 773727452.0, 'completions/mean_length': 8374.1328125, 'completions/min_length': 1126.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7695.33056640625, 'completions/min_terminated_length': 1126.0, 'completions/max_terminated_length': 16221.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2630355656147003, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019787868484854698, 'sampling/sampling_logp_difference/max': 8.716691017150879, 'sampling/importance_sampling_ratio/min': 0.00016382840112783015, 'sampling/importance_sampling_ratio/mean': 0.9999996423721313, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6981816379011434e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 846/1024 [38:27:38<8:50:59, 178.99s/it][AINFO 12-02 09:58:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:58:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:58:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 09:58:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 83%|████████▎ | 847/1024 [38:30:38<8:49:07, 179.36s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0020049060694873333, 'learning_rate': 1e-05, 'num_tokens': 774935034.0, 'completions/mean_length': 9235.984375, 'completions/min_length': 2064.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 8630.220703125, 'completions/min_terminated_length': 2064.0, 'completions/max_terminated_length': 16163.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.28223684430122375, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020020615309476852, 'sampling/sampling_logp_difference/max': 4.238381862640381, 'sampling/importance_sampling_ratio/min': 0.014430925250053406, 'sampling/importance_sampling_ratio/mean': 1.000014066696167, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.639867160629365e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 847/1024 [38:30:38<8:49:07, 179.36s/it][AINFO 12-02 10:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:01:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 83%|████████▎ | 848/1024 [38:33:24<8:34:22, 175.35s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0015544194029644132, 'learning_rate': 1e-05, 'num_tokens': 775895528.0, 'completions/mean_length': 7347.671875, 'completions/min_length': 548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7130.80029296875, 'completions/min_terminated_length': 548.0, 'completions/max_terminated_length': 15377.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.19438526034355164, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021253444254398346, 'sampling/sampling_logp_difference/max': 2.2227697372436523, 'sampling/importance_sampling_ratio/min': 0.10830871015787125, 'sampling/importance_sampling_ratio/mean': 0.9999622106552124, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.488279687895556e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 848/1024 [38:33:24<8:34:22, 175.35s/it][AINFO 12-02 10:04:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:04:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 83%|████████▎ | 849/1024 [38:36:14<8:26:04, 173.51s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002275155857205391, 'learning_rate': 1e-05, 'num_tokens': 776835084.0, 'completions/mean_length': 7189.65625, 'completions/min_length': 1774.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6968.9921875, 'completions/min_terminated_length': 1774.0, 'completions/max_terminated_length': 16330.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.31748437881469727, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02002628520131111, 'sampling/sampling_logp_difference/max': 1.960306167602539, 'sampling/importance_sampling_ratio/min': 0.14081530272960663, 'sampling/importance_sampling_ratio/mean': 1.0000438690185547, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.381332612661936e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 849/1024 [38:36:14<8:26:04, 173.51s/it][AINFO 12-02 10:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:07:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 83%|████████▎ | 850/1024 [38:39:23<8:36:49, 178.21s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.003266304498538375, 'learning_rate': 1e-05, 'num_tokens': 777938547.0, 'completions/mean_length': 8475.3046875, 'completions/min_length': 1272.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7877.16845703125, 'completions/min_terminated_length': 1272.0, 'completions/max_terminated_length': 16331.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3174794614315033, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02101711742579937, 'sampling/sampling_logp_difference/max': 2.40163516998291, 'sampling/importance_sampling_ratio/min': 0.09056973457336426, 'sampling/importance_sampling_ratio/mean': 1.0000240802764893, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.823992064004415e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 850/1024 [38:39:23<8:36:49, 178.21s/it][AINFO 12-02 10:10:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:10:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:10:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:10:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 83%|████████▎ | 851/1024 [38:42:22<8:34:59, 178.61s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.00161365803796798, 'learning_rate': 1e-05, 'num_tokens': 778926557.0, 'completions/mean_length': 7556.703125, 'completions/min_length': 1381.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7046.03271484375, 'completions/min_terminated_length': 1381.0, 'completions/max_terminated_length': 16028.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.20411096513271332, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018330518156290054, 'sampling/sampling_logp_difference/max': 2.5300955772399902, 'sampling/importance_sampling_ratio/min': 0.07965140789747238, 'sampling/importance_sampling_ratio/mean': 0.99998939037323, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.961379172778834e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 851/1024 [38:42:22<8:34:59, 178.61s/it][AINFO 12-02 10:13:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:13:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:13:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:13:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 83%|████████▎ | 852/1024 [38:45:26<8:36:07, 180.04s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002390077570453286, 'learning_rate': 1e-05, 'num_tokens': 779840106.0, 'completions/mean_length': 6961.3515625, 'completions/min_length': 1014.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6578.31689453125, 'completions/min_terminated_length': 1014.0, 'completions/max_terminated_length': 16206.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29719969630241394, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01974567398428917, 'sampling/sampling_logp_difference/max': 1.6435396671295166, 'sampling/importance_sampling_ratio/min': 0.1944049447774887, 'sampling/importance_sampling_ratio/mean': 1.0000224113464355, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.881347735543386e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 852/1024 [38:45:26<8:36:07, 180.04s/it][AINFO 12-02 10:16:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:16:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:16:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:16:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 83%|████████▎ | 853/1024 [38:48:31<8:37:24, 181.54s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0022213333286345005, 'learning_rate': 1e-05, 'num_tokens': 781017258.0, 'completions/mean_length': 9037.5625, 'completions/min_length': 1025.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 8346.8720703125, 'completions/min_terminated_length': 1025.0, 'completions/max_terminated_length': 16000.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2469991147518158, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019398625940084457, 'sampling/sampling_logp_difference/max': 2.723416566848755, 'sampling/importance_sampling_ratio/min': 0.065650075674057, 'sampling/importance_sampling_ratio/mean': 1.0000066757202148, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.742091283560512e-05, 'epoch': 0.78}
+
+ 83%|████████▎ | 853/1024 [38:48:31<8:37:24, 181.54s/it][AINFO 12-02 10:19:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:19:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:19:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:19:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 83%|████████▎ | 854/1024 [38:51:29<8:31:57, 180.69s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0020788784604519606, 'learning_rate': 1e-05, 'num_tokens': 781954881.0, 'completions/mean_length': 7185.8671875, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6406.3642578125, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 15779.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.21542644500732422, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018667232245206833, 'sampling/sampling_logp_difference/max': 5.910137176513672, 'sampling/importance_sampling_ratio/min': 0.0027118148282170296, 'sampling/importance_sampling_ratio/mean': 0.9999613761901855, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9321135343707283e-05, 'epoch': 0.79}
+
+ 83%|████████▎ | 854/1024 [38:51:29<8:31:57, 180.69s/it][AINFO 12-02 10:22:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:22:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:22:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:22:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 83%|████████▎ | 855/1024 [38:54:22<8:22:15, 178.31s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0018167541129514575, 'learning_rate': 1e-05, 'num_tokens': 782817196.0, 'completions/mean_length': 6574.1484375, 'completions/min_length': 1103.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6257.70166015625, 'completions/min_terminated_length': 1103.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.33327072858810425, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01832202449440956, 'sampling/sampling_logp_difference/max': 1.6353838443756104, 'sampling/importance_sampling_ratio/min': 0.19487755000591278, 'sampling/importance_sampling_ratio/mean': 0.9999975562095642, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.669704594562063e-05, 'epoch': 0.79}
+
+ 83%|████████▎ | 855/1024 [38:54:22<8:22:15, 178.31s/it][AINFO 12-02 10:25:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:25:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:25:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:25:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▎ | 856/1024 [38:57:16<8:15:34, 176.99s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0011294299038127065, 'learning_rate': 1e-05, 'num_tokens': 783868986.0, 'completions/mean_length': 8063.171875, 'completions/min_length': 1156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7794.7578125, 'completions/min_terminated_length': 1156.0, 'completions/max_terminated_length': 15875.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.1922685205936432, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021533772349357605, 'sampling/sampling_logp_difference/max': 4.7146687507629395, 'sampling/importance_sampling_ratio/min': 0.008962835185229778, 'sampling/importance_sampling_ratio/mean': 0.9999197125434875, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.5775112678493315e-05, 'epoch': 0.79}
+
+ 84%|████████▎ | 856/1024 [38:57:16<8:15:34, 176.99s/it][AINFO 12-02 10:28:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:28:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:28:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:28:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▎ | 857/1024 [39:00:19<8:17:55, 178.89s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0012968285009264946, 'learning_rate': 1e-05, 'num_tokens': 784917179.0, 'completions/mean_length': 8036.0078125, 'completions/min_length': 2154.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7172.42236328125, 'completions/min_terminated_length': 2154.0, 'completions/max_terminated_length': 16025.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018743857741355896, 'sampling/sampling_logp_difference/max': 5.624881744384766, 'sampling/importance_sampling_ratio/min': 0.00360698951408267, 'sampling/importance_sampling_ratio/mean': 0.9999432563781738, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.823750853371166e-05, 'epoch': 0.79}
+
+ 84%|████████▎ | 857/1024 [39:00:19<8:17:55, 178.89s/it][AINFO 12-02 10:31:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:31:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:31:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:31:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▍ | 858/1024 [39:03:44<8:36:36, 186.73s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0012788564199581742, 'learning_rate': 1e-05, 'num_tokens': 786142277.0, 'completions/mean_length': 9399.890625, 'completions/min_length': 1691.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 8743.265625, 'completions/min_terminated_length': 1691.0, 'completions/max_terminated_length': 16258.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020173542201519012, 'sampling/sampling_logp_difference/max': 5.491243839263916, 'sampling/importance_sampling_ratio/min': 0.00412271311506629, 'sampling/importance_sampling_ratio/mean': 1.000032901763916, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.397260554218519e-05, 'epoch': 0.79}
+
+ 84%|████████▍ | 858/1024 [39:03:44<8:36:36, 186.73s/it][AINFO 12-02 10:35:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:35:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:35:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:35:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 84%|████████▍ | 859/1024 [39:06:26<8:12:52, 179.23s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001686324831098318, 'learning_rate': 1e-05, 'num_tokens': 787069294.0, 'completions/mean_length': 7073.0703125, 'completions/min_length': 515.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6615.1552734375, 'completions/min_terminated_length': 515.0, 'completions/max_terminated_length': 15315.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2751026153564453, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019357485696673393, 'sampling/sampling_logp_difference/max': 4.090906620025635, 'sampling/importance_sampling_ratio/min': 0.016724063083529472, 'sampling/importance_sampling_ratio/mean': 0.9999649524688721, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5121235416445415e-05, 'epoch': 0.79}
+
+ 84%|████████▍ | 859/1024 [39:06:26<8:12:52, 179.23s/it][AINFO 12-02 10:37:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:37:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:37:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:37:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▍ | 860/1024 [39:08:56<7:45:33, 170.32s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.003065646393224597, 'learning_rate': 1e-05, 'num_tokens': 787859610.0, 'completions/mean_length': 6026.34375, 'completions/min_length': 861.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5944.78759765625, 'completions/min_terminated_length': 861.0, 'completions/max_terminated_length': 15147.0, 'rewards/accuracy_reward/mean': 0.6640625, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.6640625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018187616020441055, 'sampling/sampling_logp_difference/max': 4.061704635620117, 'sampling/importance_sampling_ratio/min': 0.01721964031457901, 'sampling/importance_sampling_ratio/mean': 1.000065803527832, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.551551342046878e-05, 'epoch': 0.79}
+
+ 84%|████████▍ | 860/1024 [39:08:56<7:45:33, 170.32s/it][AINFO 12-02 10:40:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:40:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:40:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:40:12 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 84%|████████▍ | 861/1024 [39:11:59<7:53:01, 174.12s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002321940613910556, 'learning_rate': 1e-05, 'num_tokens': 788958107.0, 'completions/mean_length': 8430.5078125, 'completions/min_length': 1095.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7756.48291015625, 'completions/min_terminated_length': 1095.0, 'completions/max_terminated_length': 16211.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.20175684988498688, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02053608000278473, 'sampling/sampling_logp_difference/max': 2.113706350326538, 'sampling/importance_sampling_ratio/min': 0.12078944593667984, 'sampling/importance_sampling_ratio/mean': 0.9999955892562866, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.956809143801365e-05, 'epoch': 0.79}
+
+ 84%|████████▍ | 861/1024 [39:11:59<7:53:01, 174.12s/it][AINFO 12-02 10:43:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:43:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:43:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:43:15 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▍ | 862/1024 [39:14:46<7:44:23, 172.00s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017055675853043795, 'learning_rate': 1e-05, 'num_tokens': 789907923.0, 'completions/mean_length': 7244.625, 'completions/min_length': 1081.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7099.55615234375, 'completions/min_terminated_length': 1081.0, 'completions/max_terminated_length': 14898.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2177756428718567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.023493703454732895, 'sampling/sampling_logp_difference/max': 14.477801322937012, 'sampling/importance_sampling_ratio/min': 5.156687166163465e-07, 'sampling/importance_sampling_ratio/mean': 0.9998587965965271, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.374810341938428e-05, 'epoch': 0.79}
+
+ 84%|████████▍ | 862/1024 [39:14:46<7:44:23, 172.00s/it][AINFO 12-02 10:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:46:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:46:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▍ | 863/1024 [39:17:49<7:50:14, 175.25s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002711524721235037, 'learning_rate': 1e-05, 'num_tokens': 790889112.0, 'completions/mean_length': 7532.6015625, 'completions/min_length': 776.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6782.48291015625, 'completions/min_terminated_length': 776.0, 'completions/max_terminated_length': 16382.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3372175395488739, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019216477870941162, 'sampling/sampling_logp_difference/max': 8.540743827819824, 'sampling/importance_sampling_ratio/min': 0.00019534491002559662, 'sampling/importance_sampling_ratio/mean': 0.99997478723526, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.476442467828747e-05, 'epoch': 0.79}
+
+ 84%|████████▍ | 863/1024 [39:17:49<7:50:14, 175.25s/it][AINFO 12-02 10:49:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:49:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:49:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:49:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▍ | 864/1024 [39:20:55<7:55:54, 178.47s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001407406758517027, 'learning_rate': 1e-05, 'num_tokens': 791833960.0, 'completions/mean_length': 7231.0625, 'completions/min_length': 806.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6370.5302734375, 'completions/min_terminated_length': 806.0, 'completions/max_terminated_length': 16115.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.23857943713665009, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019311869516968727, 'sampling/sampling_logp_difference/max': 3.2482759952545166, 'sampling/importance_sampling_ratio/min': 0.038841113448143005, 'sampling/importance_sampling_ratio/mean': 1.0000053644180298, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.587035255099181e-05, 'epoch': 0.79}
+
+ 84%|████████▍ | 864/1024 [39:20:55<7:55:54, 178.47s/it][AINFO 12-02 10:52:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:52:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:52:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:52:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 84%|████████▍ | 865/1024 [39:23:51<7:51:41, 178.00s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002966703614220023, 'learning_rate': 1e-05, 'num_tokens': 792965431.0, 'completions/mean_length': 8702.6171875, 'completions/min_length': 1184.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8390.365234375, 'completions/min_terminated_length': 1184.0, 'completions/max_terminated_length': 15746.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.24777325987815857, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022610265761613846, 'sampling/sampling_logp_difference/max': 1.5977709293365479, 'sampling/importance_sampling_ratio/min': 0.23458577692508698, 'sampling/importance_sampling_ratio/mean': 1.0000499486923218, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.605102756409906e-05, 'epoch': 0.8}
+
+ 84%|████████▍ | 865/1024 [39:23:51<7:51:41, 178.00s/it][AINFO 12-02 10:55:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:55:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 85%|████████▍ | 866/1024 [39:26:46<7:45:58, 176.95s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0010815791320055723, 'learning_rate': 1e-05, 'num_tokens': 793903716.0, 'completions/mean_length': 7192.6015625, 'completions/min_length': 847.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6896.1044921875, 'completions/min_terminated_length': 847.0, 'completions/max_terminated_length': 15267.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.28749164938926697, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0194624625146389, 'sampling/sampling_logp_difference/max': 2.0913991928100586, 'sampling/importance_sampling_ratio/min': 0.2639823853969574, 'sampling/importance_sampling_ratio/mean': 0.9999594688415527, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.922031166643137e-05, 'epoch': 0.8}
+
+ 85%|████████▍ | 866/1024 [39:26:46<7:45:58, 176.95s/it][AINFO 12-02 10:58:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:58:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:58:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 10:58:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 85%|████████▍ | 867/1024 [39:29:30<7:32:49, 173.06s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002814630977809429, 'learning_rate': 1e-05, 'num_tokens': 794803786.0, 'completions/mean_length': 6878.734375, 'completions/min_length': 863.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6727.857421875, 'completions/min_terminated_length': 863.0, 'completions/max_terminated_length': 15241.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.29826050996780396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021359197795391083, 'sampling/sampling_logp_difference/max': 2.0843067169189453, 'sampling/importance_sampling_ratio/min': 0.12439332902431488, 'sampling/importance_sampling_ratio/mean': 0.9999914169311523, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.2921867563782143e-05, 'epoch': 0.8}
+
+ 85%|████████▍ | 867/1024 [39:29:30<7:32:49, 173.06s/it][AINFO 12-02 11:00:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:00:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 85%|████████▍ | 868/1024 [39:32:07<7:17:47, 168.38s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0009225695393979549, 'learning_rate': 1e-05, 'num_tokens': 795659533.0, 'completions/mean_length': 6510.0859375, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6108.70703125, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 16378.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02018270641565323, 'sampling/sampling_logp_difference/max': 9.486912727355957, 'sampling/importance_sampling_ratio/min': 7.583787373732775e-05, 'sampling/importance_sampling_ratio/mean': 0.9999755024909973, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.523924800674649e-05, 'epoch': 0.8}
+
+ 85%|████████▍ | 868/1024 [39:32:07<7:17:47, 168.38s/it][AINFO 12-02 11:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:03:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:03:24 [block_pool.py:292] Successfully reset prefix cache
+
+ 85%|████████▍ | 869/1024 [39:35:13<7:28:12, 173.50s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002187917474657297, 'learning_rate': 1e-05, 'num_tokens': 796626947.0, 'completions/mean_length': 7415.609375, 'completions/min_length': 647.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6896.7763671875, 'completions/min_terminated_length': 647.0, 'completions/max_terminated_length': 15693.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.287486732006073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020773276686668396, 'sampling/sampling_logp_difference/max': 2.6809229850769043, 'sampling/importance_sampling_ratio/min': 0.06849990040063858, 'sampling/importance_sampling_ratio/mean': 1.000030755996704, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.838271951892239e-05, 'epoch': 0.8}
+
+ 85%|████████▍ | 869/1024 [39:35:13<7:28:12, 173.50s/it][AINFO 12-02 11:06:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:06:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:06:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:06:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 85%|████████▍ | 870/1024 [39:38:13<7:30:48, 175.64s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015151570551097393, 'learning_rate': 1e-05, 'num_tokens': 797662547.0, 'completions/mean_length': 7944.6875, 'completions/min_length': 1169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7306.42041015625, 'completions/min_terminated_length': 1169.0, 'completions/max_terminated_length': 16236.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019307391718029976, 'sampling/sampling_logp_difference/max': 3.0490007400512695, 'sampling/importance_sampling_ratio/min': 0.04740627110004425, 'sampling/importance_sampling_ratio/mean': 1.0000882148742676, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.688029836572241e-05, 'epoch': 0.8}
+
+ 85%|████████▍ | 870/1024 [39:38:13<7:30:48, 175.64s/it][AINFO 12-02 11:09:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:09:30 [block_pool.py:292] Successfully reset prefix cache
+
+ 85%|████████▌ | 871/1024 [39:41:18<7:34:58, 178.42s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002237052656710148, 'learning_rate': 1e-05, 'num_tokens': 798842577.0, 'completions/mean_length': 9064.046875, 'completions/min_length': 1614.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8947.857421875, 'completions/min_terminated_length': 1614.0, 'completions/max_terminated_length': 16330.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.25436580181121826, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021627549082040787, 'sampling/sampling_logp_difference/max': 4.2458415031433105, 'sampling/importance_sampling_ratio/min': 0.014323674142360687, 'sampling/importance_sampling_ratio/mean': 1.0000358819961548, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.023992642032681e-05, 'epoch': 0.8}
+
+ 85%|████████▌ | 871/1024 [39:41:18<7:34:58, 178.42s/it][AINFO 12-02 11:12:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:12:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 85%|████████▌ | 872/1024 [39:43:57<7:17:08, 172.56s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0014427684945985675, 'learning_rate': 1e-05, 'num_tokens': 799776406.0, 'completions/mean_length': 7137.3515625, 'completions/min_length': 1123.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6990.57958984375, 'completions/min_terminated_length': 1123.0, 'completions/max_terminated_length': 15934.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.26932865381240845, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018964501097798347, 'sampling/sampling_logp_difference/max': 9.98162841796875, 'sampling/importance_sampling_ratio/min': 4.6241708332672715e-05, 'sampling/importance_sampling_ratio/mean': 1.0000889301300049, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.4043104026059154e-05, 'epoch': 0.8}
+
+ 85%|████████▌ | 872/1024 [39:43:57<7:17:08, 172.56s/it][AINFO 12-02 11:15:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:15:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:15:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:15:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 85%|████████▌ | 873/1024 [39:47:02<7:23:46, 176.33s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0011206723283976316, 'learning_rate': 1e-05, 'num_tokens': 800921392.0, 'completions/mean_length': 8788.703125, 'completions/min_length': 1298.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8479.951171875, 'completions/min_terminated_length': 1298.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3758672773838043, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021534522995352745, 'sampling/sampling_logp_difference/max': 4.332549571990967, 'sampling/importance_sampling_ratio/min': 0.013134018518030643, 'sampling/importance_sampling_ratio/mean': 1.0000324249267578, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.406853774227784e-05, 'epoch': 0.8}
+
+ 85%|████████▌ | 873/1024 [39:47:02<7:23:46, 176.33s/it][AINFO 12-02 11:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:18:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:18:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 85%|████████▌ | 874/1024 [39:49:47<7:11:43, 172.69s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001419111737050116, 'learning_rate': 1e-05, 'num_tokens': 801795989.0, 'completions/mean_length': 6691.5390625, 'completions/min_length': 825.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6214.8603515625, 'completions/min_terminated_length': 825.0, 'completions/max_terminated_length': 15815.0, 'rewards/accuracy_reward/mean': 0.6171875, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.6171875, 'reward_std': 0.21778054535388947, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017835767939686775, 'sampling/sampling_logp_difference/max': 1.874929428100586, 'sampling/importance_sampling_ratio/min': 0.15336579084396362, 'sampling/importance_sampling_ratio/mean': 0.9999727010726929, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.305129220938397e-05, 'epoch': 0.8}
+
+ 85%|████████▌ | 874/1024 [39:49:47<7:11:43, 172.69s/it][AINFO 12-02 11:21:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:21:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 85%|████████▌ | 875/1024 [39:52:43<7:11:47, 173.88s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0014789372216910124, 'learning_rate': 1e-05, 'num_tokens': 802833993.0, 'completions/mean_length': 7958.84375, 'completions/min_length': 608.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7471.4375, 'completions/min_terminated_length': 608.0, 'completions/max_terminated_length': 15956.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01966325007379055, 'sampling/sampling_logp_difference/max': 12.89694881439209, 'sampling/importance_sampling_ratio/min': 2.5056840513570933e-06, 'sampling/importance_sampling_ratio/mean': 1.0000438690185547, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.704439329041634e-05, 'epoch': 0.8}
+
+ 85%|████████▌ | 875/1024 [39:52:43<7:11:47, 173.88s/it][AINFO 12-02 11:24:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:24:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:24:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:24:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 86%|████████▌ | 876/1024 [39:55:35<7:07:30, 173.31s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0014852028107270598, 'learning_rate': 1e-05, 'num_tokens': 803879769.0, 'completions/mean_length': 7998.5625, 'completions/min_length': 1041.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7797.3125, 'completions/min_terminated_length': 1041.0, 'completions/max_terminated_length': 15988.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019409453496336937, 'sampling/sampling_logp_difference/max': 8.719453811645508, 'sampling/importance_sampling_ratio/min': 0.00016337640408892184, 'sampling/importance_sampling_ratio/mean': 0.9999555945396423, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.9549686056925566e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 876/1024 [39:55:35<7:07:30, 173.31s/it][AINFO 12-02 11:26:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:26:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 86%|████████▌ | 877/1024 [39:58:40<7:13:18, 176.86s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001812258386053145, 'learning_rate': 1e-05, 'num_tokens': 804986805.0, 'completions/mean_length': 8530.09375, 'completions/min_length': 722.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8006.50048828125, 'completions/min_terminated_length': 722.0, 'completions/max_terminated_length': 16067.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3232485055923462, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021551113575696945, 'sampling/sampling_logp_difference/max': 2.196342945098877, 'sampling/importance_sampling_ratio/min': 0.11120911687612534, 'sampling/importance_sampling_ratio/mean': 0.9999451041221619, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.714755429333309e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 877/1024 [39:58:40<7:13:18, 176.86s/it][AINFO 12-02 11:29:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:29:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:29:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:29:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 86%|████████▌ | 878/1024 [40:01:28<7:03:58, 174.24s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0014480475801974535, 'learning_rate': 1e-05, 'num_tokens': 805854353.0, 'completions/mean_length': 6578.03125, 'completions/min_length': 1357.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6095.77001953125, 'completions/min_terminated_length': 1357.0, 'completions/max_terminated_length': 15746.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.21040895581245422, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0201883427798748, 'sampling/sampling_logp_difference/max': 2.0096426010131836, 'sampling/importance_sampling_ratio/min': 0.19418850541114807, 'sampling/importance_sampling_ratio/mean': 1.0000325441360474, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.637187660388008e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 878/1024 [40:01:28<7:03:58, 174.24s/it][AINFO 12-02 11:32:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:32:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:32:45 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:32:45 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 86%|████████▌ | 879/1024 [40:04:30<7:06:35, 176.52s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0012852424988523126, 'learning_rate': 1e-05, 'num_tokens': 806906440.0, 'completions/mean_length': 8081.7421875, 'completions/min_length': 1578.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7143.2255859375, 'completions/min_terminated_length': 1578.0, 'completions/max_terminated_length': 16175.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020440982654690742, 'sampling/sampling_logp_difference/max': 1.5420751571655273, 'sampling/importance_sampling_ratio/min': 0.2139366865158081, 'sampling/importance_sampling_ratio/mean': 0.999962329864502, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.967755498659244e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 879/1024 [40:04:30<7:06:35, 176.52s/it][AINFO 12-02 11:35:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:35:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:35:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:35:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 86%|████████▌ | 880/1024 [40:07:26<7:02:54, 176.21s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0021891742944717407, 'learning_rate': 1e-05, 'num_tokens': 807734403.0, 'completions/mean_length': 6310.4609375, 'completions/min_length': 548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5985.5078125, 'completions/min_terminated_length': 548.0, 'completions/max_terminated_length': 16153.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.29302334785461426, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020060179755091667, 'sampling/sampling_logp_difference/max': 13.620281219482422, 'sampling/importance_sampling_ratio/min': 1.215589691128116e-06, 'sampling/importance_sampling_ratio/mean': 0.9999933242797852, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.653677837675787e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 880/1024 [40:07:26<7:02:54, 176.21s/it][AINFO 12-02 11:38:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:38:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:38:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:38:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 86%|████████▌ | 881/1024 [40:10:45<7:16:12, 183.03s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0012711554300040007, 'learning_rate': 1e-05, 'num_tokens': 808822377.0, 'completions/mean_length': 8349.671875, 'completions/min_length': 719.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7814.05029296875, 'completions/min_terminated_length': 719.0, 'completions/max_terminated_length': 16326.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02049967646598816, 'sampling/sampling_logp_difference/max': 3.213592529296875, 'sampling/importance_sampling_ratio/min': 0.040211889892816544, 'sampling/importance_sampling_ratio/mean': 0.9999496936798096, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.916141526540741e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 881/1024 [40:10:45<7:16:12, 183.03s/it][AINFO 12-02 11:42:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:42:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:42:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:42:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 86%|████████▌ | 882/1024 [40:13:41<7:08:22, 181.00s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0011046364670619369, 'learning_rate': 1e-05, 'num_tokens': 809729144.0, 'completions/mean_length': 6932.1796875, 'completions/min_length': 975.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6705.33642578125, 'completions/min_terminated_length': 975.0, 'completions/max_terminated_length': 15780.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.1820138692855835, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020482853055000305, 'sampling/sampling_logp_difference/max': 2.003894805908203, 'sampling/importance_sampling_ratio/min': 0.1348091959953308, 'sampling/importance_sampling_ratio/mean': 0.9999459981918335, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.088586911166203e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 882/1024 [40:13:41<7:08:22, 181.00s/it][AINFO 12-02 11:44:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:44:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 86%|████████▌ | 883/1024 [40:16:49<7:10:02, 183.00s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0017135550733655691, 'learning_rate': 1e-05, 'num_tokens': 810867356.0, 'completions/mean_length': 8751.46875, 'completions/min_length': 982.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 8174.21875, 'completions/min_terminated_length': 982.0, 'completions/max_terminated_length': 15891.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3077537715435028, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0199655182659626, 'sampling/sampling_logp_difference/max': 2.482398271560669, 'sampling/importance_sampling_ratio/min': 0.08354262262582779, 'sampling/importance_sampling_ratio/mean': 0.999957799911499, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.397821257702162e-05, 'epoch': 0.81}
+
+ 86%|████████▌ | 883/1024 [40:16:49<7:10:02, 183.00s/it][AINFO 12-02 11:48:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:48:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:48:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:48:05 [block_pool.py:292] Successfully reset prefix cache
+
+ 86%|████████▋ | 884/1024 [40:19:46<7:03:09, 181.35s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0007195719517767429, 'learning_rate': 1e-05, 'num_tokens': 812013437.0, 'completions/mean_length': 8768.2578125, 'completions/min_length': 1793.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8393.712890625, 'completions/min_terminated_length': 1793.0, 'completions/max_terminated_length': 15687.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.20517179369926453, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021406741812825203, 'sampling/sampling_logp_difference/max': 2.8916714191436768, 'sampling/importance_sampling_ratio/min': 0.05548340082168579, 'sampling/importance_sampling_ratio/mean': 0.9999544024467468, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.851640053653682e-05, 'epoch': 0.81}
+
+ 86%|████████▋ | 884/1024 [40:19:46<7:03:09, 181.35s/it][AINFO 12-02 11:51:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:51:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:51:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:51:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 86%|████████▋ | 885/1024 [40:22:24<6:43:47, 174.30s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001177979982458055, 'learning_rate': 1e-05, 'num_tokens': 813003241.0, 'completions/mean_length': 7550.28125, 'completions/min_length': 1400.0, 'completions/max_length': 15712.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7550.28125, 'completions/min_terminated_length': 1400.0, 'completions/max_terminated_length': 15712.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.19780512154102325, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020617671310901642, 'sampling/sampling_logp_difference/max': 4.819994926452637, 'sampling/importance_sampling_ratio/min': 0.008066828362643719, 'sampling/importance_sampling_ratio/mean': 1.0000030994415283, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.697703354850091e-05, 'epoch': 0.81}
+
+ 86%|████████▋ | 885/1024 [40:22:24<6:43:47, 174.30s/it][AINFO 12-02 11:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:53:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:53:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 87%|████████▋ | 886/1024 [40:25:17<6:40:03, 173.94s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001567984465509653, 'learning_rate': 1e-05, 'num_tokens': 813944369.0, 'completions/mean_length': 7194.125, 'completions/min_length': 934.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6662.47900390625, 'completions/min_terminated_length': 934.0, 'completions/max_terminated_length': 15966.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3227117359638214, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018128447234630585, 'sampling/sampling_logp_difference/max': 13.84287166595459, 'sampling/importance_sampling_ratio/min': 9.730098327054293e-07, 'sampling/importance_sampling_ratio/mean': 1.0000003576278687, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.242897507036105e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 886/1024 [40:25:17<6:40:03, 173.94s/it][AINFO 12-02 11:56:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:56:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:56:34 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:56:34 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 87%|████████▋ | 887/1024 [40:28:08<6:35:20, 173.15s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0012894226238131523, 'learning_rate': 1e-05, 'num_tokens': 814940814.0, 'completions/mean_length': 7615.3515625, 'completions/min_length': 879.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7476.1669921875, 'completions/min_terminated_length': 879.0, 'completions/max_terminated_length': 16250.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020850371569395065, 'sampling/sampling_logp_difference/max': 3.779207944869995, 'sampling/importance_sampling_ratio/min': 0.022840775549411774, 'sampling/importance_sampling_ratio/mean': 1.0000061988830566, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.629802413091966e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 887/1024 [40:28:08<6:35:20, 173.15s/it][AINFO 12-02 11:59:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:59:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:59:25 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 11:59:25 [block_pool.py:292] Successfully reset prefix cache
+
+ 87%|████████▋ | 888/1024 [40:30:50<6:24:39, 169.70s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0013706967001780868, 'learning_rate': 1e-05, 'num_tokens': 815885015.0, 'completions/mean_length': 7217.3828125, 'completions/min_length': 1546.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7145.20458984375, 'completions/min_terminated_length': 1546.0, 'completions/max_terminated_length': 15546.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.22278036177158356, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02118900790810585, 'sampling/sampling_logp_difference/max': 2.2236270904541016, 'sampling/importance_sampling_ratio/min': 0.11758646368980408, 'sampling/importance_sampling_ratio/mean': 1.0001291036605835, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.025504745186481e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 888/1024 [40:30:50<6:24:39, 169.70s/it][AINFO 12-02 12:02:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:02:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:02:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:02:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 87%|████████▋ | 889/1024 [40:33:45<6:25:04, 171.14s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002103450708091259, 'learning_rate': 1e-05, 'num_tokens': 816794864.0, 'completions/mean_length': 6969.1953125, 'completions/min_length': 697.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6665.49169921875, 'completions/min_terminated_length': 697.0, 'completions/max_terminated_length': 15931.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.26932865381240845, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018807904794812202, 'sampling/sampling_logp_difference/max': 3.2177529335021973, 'sampling/importance_sampling_ratio/min': 0.04004494100809097, 'sampling/importance_sampling_ratio/mean': 0.9999291300773621, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.904328852717299e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 889/1024 [40:33:45<6:25:04, 171.14s/it][AINFO 12-02 12:05:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:05:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:05:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:05:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 87%|████████▋ | 890/1024 [40:36:36<6:22:20, 171.20s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017690680688247085, 'learning_rate': 1e-05, 'num_tokens': 817582858.0, 'completions/mean_length': 6023.203125, 'completions/min_length': 1301.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5688.98388671875, 'completions/min_terminated_length': 1301.0, 'completions/max_terminated_length': 16324.0, 'rewards/accuracy_reward/mean': 0.6328125, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.6328125, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.016954712569713593, 'sampling/sampling_logp_difference/max': 1.8426182270050049, 'sampling/importance_sampling_ratio/min': 0.15840215981006622, 'sampling/importance_sampling_ratio/mean': 0.9999496340751648, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.8294816641173384e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 890/1024 [40:36:36<6:22:20, 171.20s/it][AINFO 12-02 12:07:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:07:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:07:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:07:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 87%|████████▋ | 891/1024 [40:39:38<6:26:34, 174.40s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0006933769909664989, 'learning_rate': 1e-05, 'num_tokens': 818644084.0, 'completions/mean_length': 8125.703125, 'completions/min_length': 748.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7647.9501953125, 'completions/min_terminated_length': 748.0, 'completions/max_terminated_length': 16207.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.19728106260299683, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019947785884141922, 'sampling/sampling_logp_difference/max': 4.900289535522461, 'sampling/importance_sampling_ratio/min': 0.007444427348673344, 'sampling/importance_sampling_ratio/mean': 0.9998827576637268, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.674523566132848e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 891/1024 [40:39:38<6:26:34, 174.40s/it][AINFO 12-02 12:10:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:10:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:10:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:10:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 87%|████████▋ | 892/1024 [40:42:26<6:19:24, 172.46s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.000776404223870486, 'learning_rate': 1e-05, 'num_tokens': 819650427.0, 'completions/mean_length': 7701.4921875, 'completions/min_length': 1463.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7274.4833984375, 'completions/min_terminated_length': 1463.0, 'completions/max_terminated_length': 15913.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01964922994375229, 'sampling/sampling_logp_difference/max': 2.475506067276001, 'sampling/importance_sampling_ratio/min': 0.08412040770053864, 'sampling/importance_sampling_ratio/mean': 1.0000033378601074, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.760212030101684e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 892/1024 [40:42:26<6:19:24, 172.46s/it][AINFO 12-02 12:13:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:13:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:13:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:13:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 87%|████████▋ | 893/1024 [40:45:02<6:05:54, 167.60s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015620605554431677, 'learning_rate': 1e-05, 'num_tokens': 820548338.0, 'completions/mean_length': 6864.6796875, 'completions/min_length': 1637.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6789.724609375, 'completions/min_terminated_length': 1637.0, 'completions/max_terminated_length': 15295.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02017066441476345, 'sampling/sampling_logp_difference/max': 1.451103687286377, 'sampling/importance_sampling_ratio/min': 0.2343115359544754, 'sampling/importance_sampling_ratio/mean': 0.9999508261680603, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.892902373081597e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 893/1024 [40:45:02<6:05:54, 167.60s/it][AINFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:16:19 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 87%|████████▋ | 894/1024 [40:47:47<6:01:24, 166.81s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001096326974220574, 'learning_rate': 1e-05, 'num_tokens': 821445773.0, 'completions/mean_length': 6872.5234375, 'completions/min_length': 1252.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6485.8779296875, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 15610.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01957777887582779, 'sampling/sampling_logp_difference/max': 12.223203659057617, 'sampling/importance_sampling_ratio/min': 4.915074896416627e-06, 'sampling/importance_sampling_ratio/mean': 0.9999077916145325, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.465205347514711e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 894/1024 [40:47:47<6:01:24, 166.81s/it][AINFO 12-02 12:19:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:19:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 87%|████████▋ | 895/1024 [40:50:43<6:04:43, 169.64s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0014929590979591012, 'learning_rate': 1e-05, 'num_tokens': 822344245.0, 'completions/mean_length': 6855.3125, 'completions/min_length': 1718.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6626.62451171875, 'completions/min_terminated_length': 1718.0, 'completions/max_terminated_length': 16313.0, 'rewards/accuracy_reward/mean': 0.6015625, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.6015625, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01793181151151657, 'sampling/sampling_logp_difference/max': 6.588542461395264, 'sampling/importance_sampling_ratio/min': 0.0013760441215708852, 'sampling/importance_sampling_ratio/mean': 1.000010371208191, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.216520841917372e-05, 'epoch': 0.82}
+
+ 87%|████████▋ | 895/1024 [40:50:43<6:04:43, 169.64s/it][AINFO 12-02 12:22:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:22:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 88%|████████▊ | 896/1024 [40:53:20<5:53:27, 165.69s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017471398459747434, 'learning_rate': 1e-05, 'num_tokens': 823228439.0, 'completions/mean_length': 6750.515625, 'completions/min_length': 1079.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6358.91015625, 'completions/min_terminated_length': 1079.0, 'completions/max_terminated_length': 16226.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.29432153701782227, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018637537956237793, 'sampling/sampling_logp_difference/max': 7.35062313079834, 'sampling/importance_sampling_ratio/min': 0.0006421920843422413, 'sampling/importance_sampling_ratio/mean': 1.0000522136688232, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7327427435229765e-05, 'epoch': 0.82}
+
+ 88%|████████▊ | 896/1024 [40:53:20<5:53:27, 165.69s/it][AINFO 12-02 12:24:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:24:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:24:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:24:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 88%|████████▊ | 897/1024 [40:56:00<5:47:24, 164.13s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002119277138262987, 'learning_rate': 1e-05, 'num_tokens': 824182613.0, 'completions/mean_length': 7311.546875, 'completions/min_length': 1298.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7240.1103515625, 'completions/min_terminated_length': 1298.0, 'completions/max_terminated_length': 16180.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3419179320335388, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020174939185380936, 'sampling/sampling_logp_difference/max': 2.479416847229004, 'sampling/importance_sampling_ratio/min': 0.0837920755147934, 'sampling/importance_sampling_ratio/mean': 1.000133991241455, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.423028210316261e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 897/1024 [40:56:00<5:47:24, 164.13s/it][AINFO 12-02 12:27:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:17 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:27:17 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 88%|████████▊ | 898/1024 [40:59:02<5:55:41, 169.38s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0021438850089907646, 'learning_rate': 1e-05, 'num_tokens': 825217269.0, 'completions/mean_length': 7946.625, 'completions/min_length': 1162.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7458.51220703125, 'completions/min_terminated_length': 1162.0, 'completions/max_terminated_length': 16382.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.20357418060302734, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02016165852546692, 'sampling/sampling_logp_difference/max': 4.65170431137085, 'sampling/importance_sampling_ratio/min': 0.009545319713652134, 'sampling/importance_sampling_ratio/mean': 1.0000168085098267, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.881111331338616e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 898/1024 [40:59:02<5:55:41, 169.38s/it][AINFO 12-02 12:30:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:30:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:30:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:30:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 88%|████████▊ | 899/1024 [41:02:22<6:12:08, 178.63s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016181566752493382, 'learning_rate': 1e-05, 'num_tokens': 826415809.0, 'completions/mean_length': 9202.09375, 'completions/min_length': 848.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 8593.4580078125, 'completions/min_terminated_length': 848.0, 'completions/max_terminated_length': 15883.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2551448345184326, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02000996097922325, 'sampling/sampling_logp_difference/max': 2.3608081340789795, 'sampling/importance_sampling_ratio/min': 0.17713500559329987, 'sampling/importance_sampling_ratio/mean': 0.9999855756759644, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.78673428321963e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 899/1024 [41:02:22<6:12:08, 178.63s/it][AINFO 12-02 12:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:33:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:33:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 88%|████████▊ | 900/1024 [41:05:36<6:18:38, 183.21s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001991080353036523, 'learning_rate': 1e-05, 'num_tokens': 827391542.0, 'completions/mean_length': 7474.0390625, 'completions/min_length': 862.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6800.1767578125, 'completions/min_terminated_length': 862.0, 'completions/max_terminated_length': 16005.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020357441157102585, 'sampling/sampling_logp_difference/max': 2.400463581085205, 'sampling/importance_sampling_ratio/min': 0.09067591279745102, 'sampling/importance_sampling_ratio/mean': 0.999974250793457, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.608732282600613e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 900/1024 [41:05:36<6:18:38, 183.21s/it][AINFO 12-02 12:36:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:36:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:36:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:36:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 88%|████████▊ | 901/1024 [41:07:58<5:50:21, 170.91s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0021326204296201468, 'learning_rate': 1e-05, 'num_tokens': 828192876.0, 'completions/mean_length': 6094.171875, 'completions/min_length': 1425.0, 'completions/max_length': 14294.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6094.171875, 'completions/min_terminated_length': 1425.0, 'completions/max_terminated_length': 14294.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.3827020525932312, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019372478127479553, 'sampling/sampling_logp_difference/max': 6.282872676849365, 'sampling/importance_sampling_ratio/min': 0.0018680266803130507, 'sampling/importance_sampling_ratio/mean': 1.0000417232513428, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.43767554038277e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 901/1024 [41:07:58<5:50:21, 170.91s/it][AINFO 12-02 12:39:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:39:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:39:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:39:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 88%|████████▊ | 902/1024 [41:10:54<5:50:20, 172.30s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001343090203590691, 'learning_rate': 1e-05, 'num_tokens': 829165286.0, 'completions/mean_length': 7461.703125, 'completions/min_length': 812.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7247.568359375, 'completions/min_terminated_length': 812.0, 'completions/max_terminated_length': 15806.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020001709461212158, 'sampling/sampling_logp_difference/max': 2.050848960876465, 'sampling/importance_sampling_ratio/min': 0.17814111709594727, 'sampling/importance_sampling_ratio/mean': 0.9999566674232483, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.402358993298549e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 902/1024 [41:10:54<5:50:20, 172.30s/it][AINFO 12-02 12:42:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:42:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 88%|████████▊ | 903/1024 [41:13:41<5:44:18, 170.73s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0015660603530704975, 'learning_rate': 1e-05, 'num_tokens': 830079700.0, 'completions/mean_length': 6997.984375, 'completions/min_length': 1305.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6772.72021484375, 'completions/min_terminated_length': 1305.0, 'completions/max_terminated_length': 15026.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.30221718549728394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020265769213438034, 'sampling/sampling_logp_difference/max': 4.77354621887207, 'sampling/importance_sampling_ratio/min': 0.008450360037386417, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.7971429839653865e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 903/1024 [41:13:41<5:44:18, 170.73s/it][AINFO 12-02 12:44:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:44:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:44:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:44:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 88%|████████▊ | 904/1024 [41:16:31<5:40:58, 170.49s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0019977898336946964, 'learning_rate': 1e-05, 'num_tokens': 831060944.0, 'completions/mean_length': 7519.09375, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6767.83056640625, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.27274850010871887, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02029348351061344, 'sampling/sampling_logp_difference/max': 1.836850643157959, 'sampling/importance_sampling_ratio/min': 0.1989213526248932, 'sampling/importance_sampling_ratio/mean': 1.0000035762786865, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7778202681693074e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 904/1024 [41:16:31<5:40:58, 170.49s/it][AINFO 12-02 12:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:47:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 88%|████████▊ | 905/1024 [41:19:26<5:41:18, 172.09s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0027383514679968357, 'learning_rate': 1e-05, 'num_tokens': 832093004.0, 'completions/mean_length': 7904.21875, 'completions/min_length': 747.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7262.89111328125, 'completions/min_terminated_length': 747.0, 'completions/max_terminated_length': 16246.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.27670514583587646, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019547710195183754, 'sampling/sampling_logp_difference/max': 8.503087997436523, 'sampling/importance_sampling_ratio/min': 0.00020284102356527, 'sampling/importance_sampling_ratio/mean': 0.9999693632125854, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.78102676627168e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 905/1024 [41:19:26<5:41:18, 172.09s/it][AINFO 12-02 12:50:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:50:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:50:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:50:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 88%|████████▊ | 906/1024 [41:22:36<5:48:54, 177.41s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015192253049463034, 'learning_rate': 1e-05, 'num_tokens': 833142599.0, 'completions/mean_length': 8050.0859375, 'completions/min_length': 1473.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7711.30859375, 'completions/min_terminated_length': 1473.0, 'completions/max_terminated_length': 16179.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.23592589795589447, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01816430315375328, 'sampling/sampling_logp_difference/max': 2.871675729751587, 'sampling/importance_sampling_ratio/min': 0.056603994220495224, 'sampling/importance_sampling_ratio/mean': 0.9999800324440002, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.644028902272112e-05, 'epoch': 0.83}
+
+ 88%|████████▊ | 906/1024 [41:22:36<5:48:54, 177.41s/it][AINFO 12-02 12:53:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:53:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:53:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:53:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 89%|████████▊ | 907/1024 [41:24:47<5:18:43, 163.45s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001484649139456451, 'learning_rate': 1e-05, 'num_tokens': 833801241.0, 'completions/mean_length': 5007.765625, 'completions/min_length': 766.0, 'completions/max_length': 14340.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5007.765625, 'completions/min_terminated_length': 766.0, 'completions/max_terminated_length': 14340.0, 'rewards/accuracy_reward/mean': 0.8125, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.8125, 'reward_std': 0.1751839816570282, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018119309097528458, 'sampling/sampling_logp_difference/max': 2.2782411575317383, 'sampling/importance_sampling_ratio/min': 0.10246426612138748, 'sampling/importance_sampling_ratio/mean': 0.9999354481697083, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.198864806108759e-05, 'epoch': 0.83}
+
+ 89%|████████▊ | 907/1024 [41:24:47<5:18:43, 163.45s/it][AINFO 12-02 12:56:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:56:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:56:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:56:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 89%|████████▊ | 908/1024 [41:27:41<5:22:03, 166.58s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0016140672378242016, 'learning_rate': 1e-05, 'num_tokens': 834920327.0, 'completions/mean_length': 8611.171875, 'completions/min_length': 825.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8295.203125, 'completions/min_terminated_length': 825.0, 'completions/max_terminated_length': 16055.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2359209954738617, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019699856638908386, 'sampling/sampling_logp_difference/max': 1.810068130493164, 'sampling/importance_sampling_ratio/min': 0.1906553953886032, 'sampling/importance_sampling_ratio/mean': 1.0000495910644531, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.993544032709906e-05, 'epoch': 0.84}
+
+ 89%|████████▊ | 908/1024 [41:27:41<5:22:03, 166.58s/it][AINFO 12-02 12:58:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:58:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:58:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 12:58:58 [block_pool.py:292] Successfully reset prefix cache
+
+ 89%|████████▉ | 909/1024 [41:30:39<5:25:41, 169.92s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0014925249852240086, 'learning_rate': 1e-05, 'num_tokens': 836057251.0, 'completions/mean_length': 8735.09375, 'completions/min_length': 781.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 8156.60546875, 'completions/min_terminated_length': 781.0, 'completions/max_terminated_length': 16119.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020331356674432755, 'sampling/sampling_logp_difference/max': 8.84995174407959, 'sampling/importance_sampling_ratio/min': 0.00014338866458274424, 'sampling/importance_sampling_ratio/mean': 0.9999048113822937, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.0253051969993976e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 909/1024 [41:30:39<5:25:41, 169.92s/it][AINFO 12-02 13:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:01:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:01:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 89%|████████▉ | 910/1024 [41:33:49<5:34:29, 176.05s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0030293921008706093, 'learning_rate': 1e-05, 'num_tokens': 836960333.0, 'completions/mean_length': 6894.578125, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6090.3896484375, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 16097.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.3090568780899048, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019232023507356644, 'sampling/sampling_logp_difference/max': 3.2053380012512207, 'sampling/importance_sampling_ratio/min': 0.04054519534111023, 'sampling/importance_sampling_ratio/mean': 1.000047206878662, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.713561919648782e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 910/1024 [41:33:49<5:34:29, 176.05s/it][AINFO 12-02 13:05:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:05:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:05:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:05:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 89%|████████▉ | 911/1024 [41:36:30<5:22:59, 171.50s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0033167500514537096, 'learning_rate': 1e-05, 'num_tokens': 837833298.0, 'completions/mean_length': 6662.6015625, 'completions/min_length': 1194.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6429.2880859375, 'completions/min_terminated_length': 1194.0, 'completions/max_terminated_length': 16212.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.34480881690979004, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020060744136571884, 'sampling/sampling_logp_difference/max': 9.39183521270752, 'sampling/importance_sampling_ratio/min': 8.340225758729503e-05, 'sampling/importance_sampling_ratio/mean': 0.9999831914901733, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.809440076769533e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 911/1024 [41:36:30<5:22:59, 171.50s/it][AINFO 12-02 13:07:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:07:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:07:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:07:47 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 89%|████████▉ | 912/1024 [41:39:45<5:33:28, 178.65s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0008104958105832338, 'learning_rate': 1e-05, 'num_tokens': 838989828.0, 'completions/mean_length': 8879.453125, 'completions/min_length': 1488.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8379.150390625, 'completions/min_terminated_length': 1488.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.022497693076729774, 'sampling/sampling_logp_difference/max': 2.1741952896118164, 'sampling/importance_sampling_ratio/min': 0.11369960755109787, 'sampling/importance_sampling_ratio/mean': 1.0000214576721191, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.281003759842861e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 912/1024 [41:39:45<5:33:28, 178.65s/it][AINFO 12-02 13:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:11:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:11:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 89%|████████▉ | 913/1024 [41:42:13<5:13:09, 169.28s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0010485589737072587, 'learning_rate': 1e-05, 'num_tokens': 839856838.0, 'completions/mean_length': 6628.640625, 'completions/min_length': 1349.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6551.82666015625, 'completions/min_terminated_length': 1349.0, 'completions/max_terminated_length': 16218.0, 'rewards/accuracy_reward/mean': 0.6015625, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.6015625, 'reward_std': 0.19332444667816162, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019367247819900513, 'sampling/sampling_logp_difference/max': 1.5068955421447754, 'sampling/importance_sampling_ratio/min': 0.2215968519449234, 'sampling/importance_sampling_ratio/mean': 0.999991774559021, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.370107444628957e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 913/1024 [41:42:13<5:13:09, 169.28s/it][AINFO 12-02 13:13:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:13:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:13:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:13:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 89%|████████▉ | 914/1024 [41:44:54<5:05:48, 166.81s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001419647946022451, 'learning_rate': 1e-05, 'num_tokens': 840773173.0, 'completions/mean_length': 7013.0546875, 'completions/min_length': 905.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6710.76611328125, 'completions/min_terminated_length': 905.0, 'completions/max_terminated_length': 15288.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.25620073080062866, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01902669295668602, 'sampling/sampling_logp_difference/max': 1.5141334533691406, 'sampling/importance_sampling_ratio/min': 0.21999874711036682, 'sampling/importance_sampling_ratio/mean': 1.0000745058059692, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.859852265326481e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 914/1024 [41:44:54<5:05:48, 166.81s/it][AINFO 12-02 13:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:16:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:16:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 89%|████████▉ | 915/1024 [41:47:48<5:06:59, 168.98s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001529378117993474, 'learning_rate': 1e-05, 'num_tokens': 841741903.0, 'completions/mean_length': 7409.078125, 'completions/min_length': 655.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6730.302734375, 'completions/min_terminated_length': 655.0, 'completions/max_terminated_length': 16173.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.36007601022720337, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01980706676840782, 'sampling/sampling_logp_difference/max': 2.4111902713775635, 'sampling/importance_sampling_ratio/min': 0.08970845490694046, 'sampling/importance_sampling_ratio/mean': 0.9999992847442627, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.550180200472823e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 915/1024 [41:47:48<5:06:59, 168.98s/it][AINFO 12-02 13:19:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:19:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:19:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:19:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 89%|████████▉ | 916/1024 [41:50:43<5:07:23, 170.78s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013091018190607429, 'learning_rate': 1e-05, 'num_tokens': 842826254.0, 'completions/mean_length': 8314.3046875, 'completions/min_length': 970.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8053.99169921875, 'completions/min_terminated_length': 970.0, 'completions/max_terminated_length': 16317.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020361894741654396, 'sampling/sampling_logp_difference/max': 7.1491618156433105, 'sampling/importance_sampling_ratio/min': 0.000785522221121937, 'sampling/importance_sampling_ratio/mean': 1.0000264644622803, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.824253483093344e-05, 'epoch': 0.84}
+
+ 89%|████████▉ | 916/1024 [41:50:43<5:07:23, 170.78s/it][AINFO 12-02 13:21:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:21:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:21:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:21:59 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 90%|████████▉ | 917/1024 [41:53:25<4:59:44, 168.08s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0011579126585274935, 'learning_rate': 1e-05, 'num_tokens': 843815689.0, 'completions/mean_length': 7555.0859375, 'completions/min_length': 1744.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6966.4921875, 'completions/min_terminated_length': 1744.0, 'completions/max_terminated_length': 16105.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.25065141916275024, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01928183250129223, 'sampling/sampling_logp_difference/max': 3.641432762145996, 'sampling/importance_sampling_ratio/min': 0.026214757934212685, 'sampling/importance_sampling_ratio/mean': 0.9999576807022095, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.856658071934362e-05, 'epoch': 0.84}
+
+ 90%|████████▉ | 917/1024 [41:53:25<4:59:44, 168.08s/it][AINFO 12-02 13:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:24:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:24:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 90%|████████▉ | 918/1024 [41:56:12<4:56:34, 167.87s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002563695190474391, 'learning_rate': 1e-05, 'num_tokens': 844655142.0, 'completions/mean_length': 6411.4765625, 'completions/min_length': 1061.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6253.18310546875, 'completions/min_terminated_length': 1061.0, 'completions/max_terminated_length': 15850.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.34245961904525757, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019931353628635406, 'sampling/sampling_logp_difference/max': 5.113109588623047, 'sampling/importance_sampling_ratio/min': 0.006017342675477266, 'sampling/importance_sampling_ratio/mean': 1.000047206878662, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.9384549103924655e-05, 'epoch': 0.84}
+
+ 90%|████████▉ | 918/1024 [41:56:12<4:56:34, 167.87s/it][AINFO 12-02 13:27:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:29 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:27:29 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 90%|████████▉ | 919/1024 [41:59:36<5:12:59, 178.85s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001785408123396337, 'learning_rate': 1e-05, 'num_tokens': 845800741.0, 'completions/mean_length': 8828.6796875, 'completions/min_length': 1160.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 8257.26953125, 'completions/min_terminated_length': 1160.0, 'completions/max_terminated_length': 16377.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2811809182167053, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019120289012789726, 'sampling/sampling_logp_difference/max': 18.768327713012695, 'sampling/importance_sampling_ratio/min': 7.063482243552244e-09, 'sampling/importance_sampling_ratio/mean': 0.9999595880508423, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.4683818291323405e-05, 'epoch': 0.85}
+
+ 90%|████████▉ | 919/1024 [41:59:36<5:12:59, 178.85s/it][AINFO 12-02 13:30:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:30:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:30:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:30:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 90%|████████▉ | 920/1024 [42:02:27<5:05:27, 176.22s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001847413252107799, 'learning_rate': 1e-05, 'num_tokens': 846746347.0, 'completions/mean_length': 7259.046875, 'completions/min_length': 567.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6964.693359375, 'completions/min_terminated_length': 567.0, 'completions/max_terminated_length': 16099.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019917942583560944, 'sampling/sampling_logp_difference/max': 9.966014862060547, 'sampling/importance_sampling_ratio/min': 4.6969369577709585e-05, 'sampling/importance_sampling_ratio/mean': 0.9999884963035583, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.128687502699904e-05, 'epoch': 0.85}
+
+ 90%|████████▉ | 920/1024 [42:02:27<5:05:27, 176.22s/it][AINFO 12-02 13:33:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:33:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:33:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:33:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 90%|████████▉ | 921/1024 [42:05:14<4:57:58, 173.58s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016390514792874455, 'learning_rate': 1e-05, 'num_tokens': 847652966.0, 'completions/mean_length': 6928.3984375, 'completions/min_length': 929.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6623.37890625, 'completions/min_terminated_length': 929.0, 'completions/max_terminated_length': 16223.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.18884867429733276, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02050882950425148, 'sampling/sampling_logp_difference/max': 8.076475143432617, 'sampling/importance_sampling_ratio/min': 0.0003107645025011152, 'sampling/importance_sampling_ratio/mean': 1.000011682510376, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.685199956431461e-05, 'epoch': 0.85}
+
+ 90%|████████▉ | 921/1024 [42:05:14<4:57:58, 173.58s/it][AINFO 12-02 13:36:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:36:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 90%|█████████ | 922/1024 [42:08:19<5:01:01, 177.08s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002139459131285548, 'learning_rate': 1e-05, 'num_tokens': 848772279.0, 'completions/mean_length': 8591.1328125, 'completions/min_length': 778.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 7634.1142578125, 'completions/min_terminated_length': 778.0, 'completions/max_terminated_length': 16348.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019883275032043457, 'sampling/sampling_logp_difference/max': 4.884869575500488, 'sampling/importance_sampling_ratio/min': 0.007560109719634056, 'sampling/importance_sampling_ratio/mean': 1.0000073909759521, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.832270269616856e-05, 'epoch': 0.85}
+
+ 90%|█████████ | 922/1024 [42:08:19<5:01:01, 177.08s/it][AINFO 12-02 13:39:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:39:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:39:36 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:39:36 [block_pool.py:292] Successfully reset prefix cache
+
+ 90%|█████████ | 923/1024 [42:10:47<4:43:30, 168.42s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002933247247710824, 'learning_rate': 1e-05, 'num_tokens': 849581855.0, 'completions/mean_length': 6176.875, 'completions/min_length': 968.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5931.904296875, 'completions/min_terminated_length': 968.0, 'completions/max_terminated_length': 15947.0, 'rewards/accuracy_reward/mean': 0.6640625, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.6640625, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01915135234594345, 'sampling/sampling_logp_difference/max': 6.34769868850708, 'sampling/importance_sampling_ratio/min': 0.0017507716547697783, 'sampling/importance_sampling_ratio/mean': 1.0000362396240234, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.61821696894549e-05, 'epoch': 0.85}
+
+ 90%|█████████ | 923/1024 [42:10:47<4:43:30, 168.42s/it][AINFO 12-02 13:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:42:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:42:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 90%|█████████ | 924/1024 [42:13:34<4:39:37, 167.78s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017085481667891145, 'learning_rate': 1e-05, 'num_tokens': 850557633.0, 'completions/mean_length': 7438.015625, 'completions/min_length': 1457.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7296.01611328125, 'completions/min_terminated_length': 1457.0, 'completions/max_terminated_length': 14896.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018810277804732323, 'sampling/sampling_logp_difference/max': 3.3589601516723633, 'sampling/importance_sampling_ratio/min': 0.0347713977098465, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7123007107074955e-05, 'epoch': 0.85}
+
+ 90%|█████████ | 924/1024 [42:13:34<4:39:37, 167.78s/it][AINFO 12-02 13:44:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:44:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:44:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:44:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 90%|█████████ | 925/1024 [42:16:48<4:50:03, 175.79s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0022886833176016808, 'learning_rate': 1e-05, 'num_tokens': 851639199.0, 'completions/mean_length': 8309.921875, 'completions/min_length': 734.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7625.67822265625, 'completions/min_terminated_length': 734.0, 'completions/max_terminated_length': 16068.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.23857945203781128, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021527377888560295, 'sampling/sampling_logp_difference/max': 9.305204391479492, 'sampling/importance_sampling_ratio/min': 9.094965935219079e-05, 'sampling/importance_sampling_ratio/mean': 0.9999011754989624, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6661302551219705e-05, 'epoch': 0.85}
+
+ 90%|█████████ | 925/1024 [42:16:48<4:50:03, 175.79s/it][AINFO 12-02 13:48:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:48:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:48:05 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:48:05 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 90%|█████████ | 926/1024 [42:19:31<4:40:41, 171.85s/it][A
+                                                        [A{'loss': -0.0001, 'grad_norm': 0.0026655655819922686, 'learning_rate': 1e-05, 'num_tokens': 852540315.0, 'completions/mean_length': 6905.09375, 'completions/min_length': 1490.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6356.72705078125, 'completions/min_terminated_length': 1490.0, 'completions/max_terminated_length': 15158.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.24541422724723816, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01990903541445732, 'sampling/sampling_logp_difference/max': 2.2250614166259766, 'sampling/importance_sampling_ratio/min': 0.1829291582107544, 'sampling/importance_sampling_ratio/mean': 0.9999808669090271, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.973254001721216e-05, 'epoch': 0.85}
+
+ 90%|█████████ | 926/1024 [42:19:31<4:40:41, 171.85s/it][AINFO 12-02 13:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:47 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:50:47 [block_pool.py:292] Successfully reset prefix cache
+
+ 91%|█████████ | 927/1024 [42:22:23<4:37:48, 171.84s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001605476252734661, 'learning_rate': 1e-05, 'num_tokens': 853466236.0, 'completions/mean_length': 7085.8203125, 'completions/min_length': 1540.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6862.66455078125, 'completions/min_terminated_length': 1540.0, 'completions/max_terminated_length': 15873.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.26645541191101074, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020266639068722725, 'sampling/sampling_logp_difference/max': 1.984119176864624, 'sampling/importance_sampling_ratio/min': 0.13750167191028595, 'sampling/importance_sampling_ratio/mean': 1.0000020265579224, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.369052206560809e-05, 'epoch': 0.85}
+
+ 91%|█████████ | 927/1024 [42:22:23<4:37:48, 171.84s/it][AINFO 12-02 13:53:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:53:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:53:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:53:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 91%|█████████ | 928/1024 [42:25:37<4:45:51, 178.66s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0029054069891572, 'learning_rate': 1e-05, 'num_tokens': 854504456.0, 'completions/mean_length': 7965.46875, 'completions/min_length': 967.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7173.9833984375, 'completions/min_terminated_length': 967.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.37716054916381836, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.0188672486692667, 'sampling/sampling_logp_difference/max': 4.459580898284912, 'sampling/importance_sampling_ratio/min': 0.011567210778594017, 'sampling/importance_sampling_ratio/mean': 0.9999650716781616, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.365429721379769e-05, 'epoch': 0.85}
+
+ 91%|█████████ | 928/1024 [42:25:37<4:45:51, 178.66s/it][AINFO 12-02 13:56:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:56:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:56:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 13:56:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 91%|█████████ | 929/1024 [42:28:45<4:47:24, 181.52s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002219598973169923, 'learning_rate': 1e-05, 'num_tokens': 855613989.0, 'completions/mean_length': 8517.6640625, 'completions/min_length': 653.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8197.8935546875, 'completions/min_terminated_length': 653.0, 'completions/max_terminated_length': 15296.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02053230255842209, 'sampling/sampling_logp_difference/max': 2.805156707763672, 'sampling/importance_sampling_ratio/min': 0.06049728766083717, 'sampling/importance_sampling_ratio/mean': 1.0000061988830566, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.188748443491932e-05, 'epoch': 0.85}
+
+ 91%|█████████ | 929/1024 [42:28:45<4:47:24, 181.52s/it][AINFO 12-02 14:00:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:00:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:00:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:00:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 91%|█████████ | 930/1024 [42:31:29<4:36:09, 176.28s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017998367547988892, 'learning_rate': 1e-05, 'num_tokens': 856602050.0, 'completions/mean_length': 7553.9765625, 'completions/min_length': 960.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7342.05615234375, 'completions/min_terminated_length': 960.0, 'completions/max_terminated_length': 15902.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.22331714630126953, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019885407760739326, 'sampling/sampling_logp_difference/max': 3.0072250366210938, 'sampling/importance_sampling_ratio/min': 0.09956962615251541, 'sampling/importance_sampling_ratio/mean': 1.000013828277588, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.166733538113476e-05, 'epoch': 0.86}
+
+ 91%|█████████ | 930/1024 [42:31:29<4:36:09, 176.28s/it][AINFO 12-02 14:02:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:02:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:02:46 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:02:46 [block_pool.py:292] Successfully reset prefix cache
+
+ 91%|█████████ | 931/1024 [42:34:26<4:33:17, 176.32s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002455727197229862, 'learning_rate': 1e-05, 'num_tokens': 857629016.0, 'completions/mean_length': 7880.671875, 'completions/min_length': 749.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7813.71630859375, 'completions/min_terminated_length': 749.0, 'completions/max_terminated_length': 15948.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.31064465641975403, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021051237359642982, 'sampling/sampling_logp_difference/max': 1.9979853630065918, 'sampling/importance_sampling_ratio/min': 0.13560821115970612, 'sampling/importance_sampling_ratio/mean': 1.0000324249267578, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.354110860229412e-05, 'epoch': 0.86}
+
+ 91%|█████████ | 931/1024 [42:34:26<4:33:17, 176.32s/it][AINFO 12-02 14:05:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:05:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:05:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:05:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 91%|█████████ | 932/1024 [42:37:26<4:32:11, 177.52s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013124459655955434, 'learning_rate': 1e-05, 'num_tokens': 858598203.0, 'completions/mean_length': 7406.3359375, 'completions/min_length': 1114.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 6303.81591796875, 'completions/min_terminated_length': 1114.0, 'completions/max_terminated_length': 15488.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019717112183570862, 'sampling/sampling_logp_difference/max': 2.0183801651000977, 'sampling/importance_sampling_ratio/min': 0.13287052512168884, 'sampling/importance_sampling_ratio/mean': 1.0000731945037842, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.6777360972118913e-05, 'epoch': 0.86}
+
+ 91%|█████████ | 932/1024 [42:37:26<4:32:11, 177.52s/it][AINFO 12-02 14:08:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:08:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:08:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:08:43 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 91%|█████████ | 933/1024 [42:40:46<4:39:34, 184.33s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018669140990823507, 'learning_rate': 1e-05, 'num_tokens': 859838551.0, 'completions/mean_length': 9526.84375, 'completions/min_length': 1184.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 8882.154296875, 'completions/min_terminated_length': 1184.0, 'completions/max_terminated_length': 15713.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.1344047486782074, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.022804107517004013, 'sampling/sampling_logp_difference/max': 2.106933116912842, 'sampling/importance_sampling_ratio/min': 0.1216103583574295, 'sampling/importance_sampling_ratio/mean': 1.0000280141830444, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.133843074465403e-05, 'epoch': 0.86}
+
+ 91%|█████████ | 933/1024 [42:40:46<4:39:34, 184.33s/it][AINFO 12-02 14:12:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:12:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:12:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:12:03 [block_pool.py:292] Successfully reset prefix cache
+
+ 91%|█████████ | 934/1024 [42:43:54<4:38:05, 185.39s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013990486040711403, 'learning_rate': 1e-05, 'num_tokens': 860963755.0, 'completions/mean_length': 8655.96875, 'completions/min_length': 1262.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 8071.49609375, 'completions/min_terminated_length': 1262.0, 'completions/max_terminated_length': 15941.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018983392044901848, 'sampling/sampling_logp_difference/max': 3.035477638244629, 'sampling/importance_sampling_ratio/min': 0.048051707446575165, 'sampling/importance_sampling_ratio/mean': 0.9999777674674988, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.8567680184751225e-05, 'epoch': 0.86}
+
+ 91%|█████████ | 934/1024 [42:43:54<4:38:05, 185.39s/it][AINFO 12-02 14:15:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:15:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:15:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:15:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 91%|█████████▏| 935/1024 [42:46:24<4:19:11, 174.73s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0012264460092410445, 'learning_rate': 1e-05, 'num_tokens': 861812924.0, 'completions/mean_length': 6497.8203125, 'completions/min_length': 689.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6260.55224609375, 'completions/min_terminated_length': 689.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.22962789237499237, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019223492592573166, 'sampling/sampling_logp_difference/max': 1.7066330909729004, 'sampling/importance_sampling_ratio/min': 0.18147577345371246, 'sampling/importance_sampling_ratio/mean': 1.0000536441802979, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.485144993144786e-05, 'epoch': 0.86}
+
+ 91%|█████████▏| 935/1024 [42:46:24<4:19:11, 174.73s/it][AINFO 12-02 14:17:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:17:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:17:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:17:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 91%|█████████▏| 936/1024 [42:49:05<4:10:08, 170.55s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001141569809988141, 'learning_rate': 1e-05, 'num_tokens': 862835717.0, 'completions/mean_length': 7808.1328125, 'completions/min_length': 1168.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7531.49169921875, 'completions/min_terminated_length': 1168.0, 'completions/max_terminated_length': 14325.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.2012200653553009, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02082858607172966, 'sampling/sampling_logp_difference/max': 3.1964709758758545, 'sampling/importance_sampling_ratio/min': 0.040906306356191635, 'sampling/importance_sampling_ratio/mean': 1.000068187713623, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.09011890635702e-05, 'epoch': 0.86}
+
+ 91%|█████████▏| 936/1024 [42:49:05<4:10:08, 170.55s/it][AINFO 12-02 14:20:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:20:22 [block_pool.py:292] Successfully reset prefix cache
+
+ 92%|█████████▏| 937/1024 [42:51:58<4:08:30, 171.39s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002352684736251831, 'learning_rate': 1e-05, 'num_tokens': 863859125.0, 'completions/mean_length': 7863.0, 'completions/min_length': 1364.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7727.74658203125, 'completions/min_terminated_length': 1364.0, 'completions/max_terminated_length': 14742.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.34822866320610046, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02066079154610634, 'sampling/sampling_logp_difference/max': 2.482855796813965, 'sampling/importance_sampling_ratio/min': 0.08350440859794617, 'sampling/importance_sampling_ratio/mean': 1.0000097751617432, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.6727285254964954e-05, 'epoch': 0.86}
+
+ 92%|█████████▏| 937/1024 [42:51:58<4:08:30, 171.39s/it][AINFO 12-02 14:23:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:15 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:23:15 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 92%|█████████▏| 938/1024 [42:54:54<4:07:29, 172.66s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0025316625833511353, 'learning_rate': 1e-05, 'num_tokens': 864929612.0, 'completions/mean_length': 8203.3671875, 'completions/min_length': 1065.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7801.04052734375, 'completions/min_terminated_length': 1065.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02198244445025921, 'sampling/sampling_logp_difference/max': 1.9848909378051758, 'sampling/importance_sampling_ratio/min': 0.13739559054374695, 'sampling/importance_sampling_ratio/mean': 1.000104546546936, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.5296670780080603e-05, 'epoch': 0.86}
+
+ 92%|█████████▏| 938/1024 [42:54:54<4:07:29, 172.66s/it][AINFO 12-02 14:26:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:26:11 [block_pool.py:292] Successfully reset prefix cache
+
+ 92%|█████████▏| 939/1024 [42:58:01<4:10:42, 176.98s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001855595619417727, 'learning_rate': 1e-05, 'num_tokens': 866099693.0, 'completions/mean_length': 8980.6328125, 'completions/min_length': 1644.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8741.814453125, 'completions/min_terminated_length': 1644.0, 'completions/max_terminated_length': 16234.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019690170884132385, 'sampling/sampling_logp_difference/max': 4.722038269042969, 'sampling/importance_sampling_ratio/min': 0.008897025138139725, 'sampling/importance_sampling_ratio/mean': 0.9999998807907104, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.174705006538716e-05, 'epoch': 0.86}
+
+ 92%|█████████▏| 939/1024 [42:58:01<4:10:42, 176.98s/it][AINFO 12-02 14:29:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:29:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:29:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:29:18 [block_pool.py:292] Successfully reset prefix cache
+
+ 92%|█████████▏| 940/1024 [43:00:45<4:02:09, 172.97s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001390884630382061, 'learning_rate': 1e-05, 'num_tokens': 867049122.0, 'completions/mean_length': 7264.1015625, 'completions/min_length': 1441.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7192.29150390625, 'completions/min_terminated_length': 1441.0, 'completions/max_terminated_length': 15768.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020992206409573555, 'sampling/sampling_logp_difference/max': 3.9417929649353027, 'sampling/importance_sampling_ratio/min': 0.019413374364376068, 'sampling/importance_sampling_ratio/mean': 1.0000951290130615, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.0965280479431385e-05, 'epoch': 0.86}
+
+ 92%|█████████▏| 940/1024 [43:00:45<4:02:09, 172.97s/it][AINFO 12-02 14:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:32:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:32:01 [block_pool.py:292] Successfully reset prefix cache
+
+ 92%|█████████▏| 941/1024 [43:03:35<3:58:21, 172.30s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019669928587973118, 'learning_rate': 1e-05, 'num_tokens': 868124704.0, 'completions/mean_length': 8250.421875, 'completions/min_length': 1548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 8186.3779296875, 'completions/min_terminated_length': 1548.0, 'completions/max_terminated_length': 15354.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.022221088409423828, 'sampling/sampling_logp_difference/max': 5.717191696166992, 'sampling/importance_sampling_ratio/min': 0.003288934240117669, 'sampling/importance_sampling_ratio/mean': 0.9999855756759644, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.9264208061904355e-05, 'epoch': 0.87}
+
+ 92%|█████████▏| 941/1024 [43:03:35<3:58:21, 172.30s/it][AINFO 12-02 14:34:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:34:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:34:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:34:52 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 92%|█████████▏| 942/1024 [43:06:40<4:00:37, 176.07s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013234260259196162, 'learning_rate': 1e-05, 'num_tokens': 869190733.0, 'completions/mean_length': 8180.7890625, 'completions/min_length': 1323.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7983.91259765625, 'completions/min_terminated_length': 1323.0, 'completions/max_terminated_length': 15798.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021875999867916107, 'sampling/sampling_logp_difference/max': 2.83310866355896, 'sampling/importance_sampling_ratio/min': 0.05882968753576279, 'sampling/importance_sampling_ratio/mean': 1.0000091791152954, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.57539740637003e-05, 'epoch': 0.87}
+
+ 92%|█████████▏| 942/1024 [43:06:40<4:00:37, 176.07s/it][AINFO 12-02 14:37:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:37:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:37:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:37:57 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 92%|█████████▏| 943/1024 [43:09:38<3:58:22, 176.58s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0012899684952571988, 'learning_rate': 1e-05, 'num_tokens': 870170959.0, 'completions/mean_length': 7518.015625, 'completions/min_length': 1126.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7081.9833984375, 'completions/min_terminated_length': 1126.0, 'completions/max_terminated_length': 16186.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.29013246297836304, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020881671458482742, 'sampling/sampling_logp_difference/max': 6.874754428863525, 'sampling/importance_sampling_ratio/min': 0.0010335514089092612, 'sampling/importance_sampling_ratio/mean': 0.9999723434448242, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.155750900736166e-05, 'epoch': 0.87}
+
+ 92%|█████████▏| 943/1024 [43:09:38<3:58:22, 176.58s/it][AINFO 12-02 14:40:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:40:55 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 92%|█████████▏| 944/1024 [43:12:23<3:50:39, 173.00s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002897202270105481, 'learning_rate': 1e-05, 'num_tokens': 871056702.0, 'completions/mean_length': 6766.4296875, 'completions/min_length': 1108.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6375.47119140625, 'completions/min_terminated_length': 1108.0, 'completions/max_terminated_length': 15745.0, 'rewards/accuracy_reward/mean': 0.6328125, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.6328125, 'reward_std': 0.34716784954071045, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020130526274442673, 'sampling/sampling_logp_difference/max': 3.4416847229003906, 'sampling/importance_sampling_ratio/min': 0.03201071172952652, 'sampling/importance_sampling_ratio/mean': 0.9999530911445618, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.1928419705072884e-05, 'epoch': 0.87}
+
+ 92%|█████████▏| 944/1024 [43:12:23<3:50:39, 173.00s/it][AINFO 12-02 14:43:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:43:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:43:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:43:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 92%|█████████▏| 945/1024 [43:15:22<3:50:30, 175.07s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001736831502057612, 'learning_rate': 1e-05, 'num_tokens': 872163027.0, 'completions/mean_length': 8498.1015625, 'completions/min_length': 1723.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8110.27001953125, 'completions/min_terminated_length': 1723.0, 'completions/max_terminated_length': 15766.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2964431941509247, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0214335098862648, 'sampling/sampling_logp_difference/max': 5.299503326416016, 'sampling/importance_sampling_ratio/min': 0.004994073882699013, 'sampling/importance_sampling_ratio/mean': 0.9999889135360718, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.639453315277933e-05, 'epoch': 0.87}
+
+ 92%|█████████▏| 945/1024 [43:15:22<3:50:30, 175.07s/it][AINFO 12-02 14:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:46:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:46:39 [block_pool.py:292] Successfully reset prefix cache
+
+ 92%|█████████▏| 946/1024 [43:18:33<3:53:45, 179.81s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0008036107756197453, 'learning_rate': 1e-05, 'num_tokens': 873335751.0, 'completions/mean_length': 9021.78125, 'completions/min_length': 1514.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8530.966796875, 'completions/min_terminated_length': 1514.0, 'completions/max_terminated_length': 15987.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021093927323818207, 'sampling/sampling_logp_difference/max': 3.126608371734619, 'sampling/importance_sampling_ratio/min': 0.04386632516980171, 'sampling/importance_sampling_ratio/mean': 0.9999247193336487, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0738282362108293e-05, 'epoch': 0.87}
+
+ 92%|█████████▏| 946/1024 [43:18:33<3:53:45, 179.81s/it][AINFO 12-02 14:49:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:49:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 92%|█████████▏| 947/1024 [43:21:35<3:51:31, 180.40s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0014720447361469269, 'learning_rate': 1e-05, 'num_tokens': 874507483.0, 'completions/mean_length': 9008.46875, 'completions/min_length': 1824.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 8174.712890625, 'completions/min_terminated_length': 1824.0, 'completions/max_terminated_length': 16366.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.35901519656181335, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017479699105024338, 'sampling/sampling_logp_difference/max': 1.919478416442871, 'sampling/importance_sampling_ratio/min': 0.1466834545135498, 'sampling/importance_sampling_ratio/mean': 1.0000392198562622, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.997913305895054e-05, 'epoch': 0.87}
+
+ 92%|█████████▏| 947/1024 [43:21:35<3:51:31, 180.40s/it][AINFO 12-02 14:52:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:52:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:52:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:52:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 93%|█████████▎| 948/1024 [43:24:37<3:49:01, 180.81s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001768582034856081, 'learning_rate': 1e-05, 'num_tokens': 875614058.0, 'completions/mean_length': 8501.3046875, 'completions/min_length': 2211.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7760.19677734375, 'completions/min_terminated_length': 2211.0, 'completions/max_terminated_length': 16299.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2948406934738159, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01915239542722702, 'sampling/sampling_logp_difference/max': 4.234213829040527, 'sampling/importance_sampling_ratio/min': 0.01449119858443737, 'sampling/importance_sampling_ratio/mean': 0.9999799132347107, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.832540776078531e-05, 'epoch': 0.87}
+
+ 93%|█████████▎| 948/1024 [43:24:37<3:49:01, 180.81s/it][AINFO 12-02 14:55:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:55:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:55:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:55:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 93%|█████████▎| 949/1024 [43:27:26<3:41:28, 177.18s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015615640440955758, 'learning_rate': 1e-05, 'num_tokens': 876685153.0, 'completions/mean_length': 8216.7421875, 'completions/min_length': 2060.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8087.103515625, 'completions/min_terminated_length': 2060.0, 'completions/max_terminated_length': 15464.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.23251095414161682, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02277110517024994, 'sampling/sampling_logp_difference/max': 6.236233711242676, 'sampling/importance_sampling_ratio/min': 0.0019572130404412746, 'sampling/importance_sampling_ratio/mean': 1.0000929832458496, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.1813152304494e-05, 'epoch': 0.87}
+
+ 93%|█████████▎| 949/1024 [43:27:26<3:41:28, 177.18s/it][AINFO 12-02 14:58:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:58:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:58:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 14:58:42 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 93%|█████████▎| 950/1024 [43:29:57<3:29:03, 169.50s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018410769989714026, 'learning_rate': 1e-05, 'num_tokens': 877576595.0, 'completions/mean_length': 6797.765625, 'completions/min_length': 1369.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6326.31103515625, 'completions/min_terminated_length': 1369.0, 'completions/max_terminated_length': 14939.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.15650184452533722, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020739436149597168, 'sampling/sampling_logp_difference/max': 2.849907875061035, 'sampling/importance_sampling_ratio/min': 0.05784965306520462, 'sampling/importance_sampling_ratio/mean': 1.0000712871551514, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.422348763455375e-05, 'epoch': 0.87}
+
+ 93%|█████████▎| 950/1024 [43:29:57<3:29:03, 169.50s/it][AINFO 12-02 15:01:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:01:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:01:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:01:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 93%|█████████▎| 951/1024 [43:32:40<3:23:47, 167.50s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0029296462889760733, 'learning_rate': 1e-05, 'num_tokens': 878538692.0, 'completions/mean_length': 7355.1953125, 'completions/min_length': 1797.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7138.50439453125, 'completions/min_terminated_length': 1797.0, 'completions/max_terminated_length': 16177.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019475776702165604, 'sampling/sampling_logp_difference/max': 4.595555305480957, 'sampling/importance_sampling_ratio/min': 0.010096613317728043, 'sampling/importance_sampling_ratio/mean': 1.0000641345977783, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.892803076472774e-05, 'epoch': 0.87}
+
+ 93%|█████████▎| 951/1024 [43:32:40<3:23:47, 167.50s/it][AINFO 12-02 15:03:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:03:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:03:57 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:03:57 [block_pool.py:292] Successfully reset prefix cache
+
+ 93%|█████████▎| 952/1024 [43:35:43<3:26:34, 172.15s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015323987463489175, 'learning_rate': 1e-05, 'num_tokens': 879675381.0, 'completions/mean_length': 8724.4453125, 'completions/min_length': 942.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8477.3623046875, 'completions/min_terminated_length': 942.0, 'completions/max_terminated_length': 16054.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.24169495701789856, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.022527402266860008, 'sampling/sampling_logp_difference/max': 5.51448392868042, 'sampling/importance_sampling_ratio/min': 0.00402800552546978, 'sampling/importance_sampling_ratio/mean': 1.0000752210617065, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.244620722853142e-05, 'epoch': 0.88}
+
+ 93%|█████████▎| 952/1024 [43:35:43<3:26:34, 172.15s/it][AINFO 12-02 15:07:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:07:00 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 93%|█████████▎| 953/1024 [43:38:43<3:26:28, 174.48s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001611744286492467, 'learning_rate': 1e-05, 'num_tokens': 880761521.0, 'completions/mean_length': 8337.46875, 'completions/min_length': 869.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7871.966796875, 'completions/min_terminated_length': 869.0, 'completions/max_terminated_length': 16308.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.24510988593101501, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020028134807944298, 'sampling/sampling_logp_difference/max': 2.49385929107666, 'sampling/importance_sampling_ratio/min': 0.08259060978889465, 'sampling/importance_sampling_ratio/mean': 1.00002121925354, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.963901497172628e-05, 'epoch': 0.88}
+
+ 93%|█████████▎| 953/1024 [43:38:43<3:26:28, 174.48s/it][AINFO 12-02 15:10:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:10:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:10:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:10:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 93%|█████████▎| 954/1024 [43:41:55<3:29:32, 179.60s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0029313804116100073, 'learning_rate': 1e-05, 'num_tokens': 881826416.0, 'completions/mean_length': 8130.3671875, 'completions/min_length': 1349.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7506.14306640625, 'completions/min_terminated_length': 1349.0, 'completions/max_terminated_length': 16147.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.34010058641433716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02037106454372406, 'sampling/sampling_logp_difference/max': 4.758142471313477, 'sampling/importance_sampling_ratio/min': 0.008581534959375858, 'sampling/importance_sampling_ratio/mean': 1.00006902217865, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.98658445799083e-05, 'epoch': 0.88}
+
+ 93%|█████████▎| 954/1024 [43:41:55<3:29:32, 179.60s/it][AINFO 12-02 15:13:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:13:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:13:11 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:13:11 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 93%|█████████▎| 955/1024 [43:44:59<3:28:15, 181.10s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0013997050700709224, 'learning_rate': 1e-05, 'num_tokens': 882881250.0, 'completions/mean_length': 8076.703125, 'completions/min_length': 774.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7448.42041015625, 'completions/min_terminated_length': 774.0, 'completions/max_terminated_length': 16098.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.17358636856079102, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020866060629487038, 'sampling/sampling_logp_difference/max': 2.939814329147339, 'sampling/importance_sampling_ratio/min': 0.052875544875860214, 'sampling/importance_sampling_ratio/mean': 1.0000051259994507, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.063800474796153e-05, 'epoch': 0.88}
+
+ 93%|█████████▎| 955/1024 [43:44:59<3:28:15, 181.10s/it][AINFO 12-02 15:16:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:16:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:16:16 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:16:16 [block_pool.py:292] Successfully reset prefix cache
+
+ 93%|█████████▎| 956/1024 [43:47:45<3:20:02, 176.50s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0030774185433983803, 'learning_rate': 1e-05, 'num_tokens': 883816821.0, 'completions/mean_length': 7179.0234375, 'completions/min_length': 1048.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7032.9130859375, 'completions/min_terminated_length': 1048.0, 'completions/max_terminated_length': 15025.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019020898267626762, 'sampling/sampling_logp_difference/max': 1.994575023651123, 'sampling/importance_sampling_ratio/min': 0.13607145845890045, 'sampling/importance_sampling_ratio/mean': 0.9999518394470215, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7497769767469435e-05, 'epoch': 0.88}
+
+ 93%|█████████▎| 956/1024 [43:47:45<3:20:02, 176.50s/it][AINFO 12-02 15:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:19:02 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 93%|█████████▎| 957/1024 [43:50:35<3:14:47, 174.44s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0012122074840590358, 'learning_rate': 1e-05, 'num_tokens': 884824622.0, 'completions/mean_length': 7719.3203125, 'completions/min_length': 912.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7439.814453125, 'completions/min_terminated_length': 912.0, 'completions/max_terminated_length': 15956.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2740417718887329, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019620876759290695, 'sampling/sampling_logp_difference/max': 2.2455625534057617, 'sampling/importance_sampling_ratio/min': 0.10586796700954437, 'sampling/importance_sampling_ratio/mean': 1.0000072717666626, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.304196469704038e-05, 'epoch': 0.88}
+
+ 93%|█████████▎| 957/1024 [43:50:35<3:14:47, 174.44s/it][AINFO 12-02 15:21:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:21:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 94%|█████████▎| 958/1024 [43:53:37<3:14:27, 176.78s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0021361447870731354, 'learning_rate': 1e-05, 'num_tokens': 885969706.0, 'completions/mean_length': 8807.53125, 'completions/min_length': 2021.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8563.12890625, 'completions/min_terminated_length': 2021.0, 'completions/max_terminated_length': 15917.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.36113685369491577, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02016299217939377, 'sampling/sampling_logp_difference/max': 2.9552547931671143, 'sampling/importance_sampling_ratio/min': 0.052065394818782806, 'sampling/importance_sampling_ratio/mean': 0.9999675750732422, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 8.301805382870953e-05, 'epoch': 0.88}
+
+ 94%|█████████▎| 958/1024 [43:53:37<3:14:27, 176.78s/it][AINFO 12-02 15:24:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:24:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:24:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:24:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 94%|█████████▎| 959/1024 [43:56:21<3:07:22, 172.96s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0035164225846529007, 'learning_rate': 1e-05, 'num_tokens': 886890030.0, 'completions/mean_length': 7036.09375, 'completions/min_length': 418.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6656.09716796875, 'completions/min_terminated_length': 418.0, 'completions/max_terminated_length': 16197.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.26143303513526917, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020008673891425133, 'sampling/sampling_logp_difference/max': 2.9260292053222656, 'sampling/importance_sampling_ratio/min': 0.053609490394592285, 'sampling/importance_sampling_ratio/mean': 1.000043272972107, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.059849106852198e-05, 'epoch': 0.88}
+
+ 94%|█████████▎| 959/1024 [43:56:21<3:07:22, 172.96s/it][AINFO 12-02 15:27:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:27:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:27:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:27:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 94%|█████████▍| 960/1024 [43:58:59<2:59:44, 168.52s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0019830227829515934, 'learning_rate': 1e-05, 'num_tokens': 887768271.0, 'completions/mean_length': 6719.0078125, 'completions/min_length': 1257.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6565.595703125, 'completions/min_terminated_length': 1257.0, 'completions/max_terminated_length': 16005.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.24382153153419495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020509829744696617, 'sampling/sampling_logp_difference/max': 3.294623374938965, 'sampling/importance_sampling_ratio/min': 0.0370820090174675, 'sampling/importance_sampling_ratio/mean': 1.0001158714294434, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.5847070396121126e-05, 'epoch': 0.88}
+
+ 94%|█████████▍| 960/1024 [43:58:59<2:59:44, 168.52s/it][AINFO 12-02 15:30:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:22 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:30:23 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 94%|█████████▍| 961/1024 [44:01:47<2:56:39, 168.24s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0015028183115646243, 'learning_rate': 1e-05, 'num_tokens': 888812038.0, 'completions/mean_length': 8000.6796875, 'completions/min_length': 2062.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7934.66943359375, 'completions/min_terminated_length': 2062.0, 'completions/max_terminated_length': 15108.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.26698726415634155, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020712941884994507, 'sampling/sampling_logp_difference/max': 3.762026786804199, 'sampling/importance_sampling_ratio/min': 0.023236596956849098, 'sampling/importance_sampling_ratio/mean': 0.9999144077301025, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.737643439511885e-05, 'epoch': 0.88}
+
+ 94%|█████████▍| 961/1024 [44:01:47<2:56:39, 168.24s/it][AINFO 12-02 15:33:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:03 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:33:03 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 94%|█████████▍| 962/1024 [44:04:07<2:45:13, 159.89s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017050143796950579, 'learning_rate': 1e-05, 'num_tokens': 889654231.0, 'completions/mean_length': 6439.0078125, 'completions/min_length': 1547.0, 'completions/max_length': 15643.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6439.0078125, 'completions/min_terminated_length': 1547.0, 'completions/max_terminated_length': 15643.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.640625, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019540676847100258, 'sampling/sampling_logp_difference/max': 3.702418565750122, 'sampling/importance_sampling_ratio/min': 0.024663804098963737, 'sampling/importance_sampling_ratio/mean': 1.0000178813934326, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.942659554288184e-05, 'epoch': 0.89}
+
+ 94%|█████████▍| 962/1024 [44:04:07<2:45:13, 159.89s/it][AINFO 12-02 15:35:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:35:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:35:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:35:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 94%|█████████▍| 963/1024 [44:06:42<2:41:00, 158.36s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0013341272715479136, 'learning_rate': 1e-05, 'num_tokens': 890513116.0, 'completions/mean_length': 6541.6640625, 'completions/min_length': 1346.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6385.43701171875, 'completions/min_terminated_length': 1346.0, 'completions/max_terminated_length': 15425.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020453786477446556, 'sampling/sampling_logp_difference/max': 2.467019557952881, 'sampling/importance_sampling_ratio/min': 0.17147304117679596, 'sampling/importance_sampling_ratio/mean': 0.9999470114707947, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.9102033775197924e-05, 'epoch': 0.89}
+
+ 94%|█████████▍| 963/1024 [44:06:42<2:41:00, 158.36s/it][AINFO 12-02 15:37:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:37:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:37:58 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:37:58 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 94%|█████████▍| 964/1024 [44:09:38<2:43:40, 163.68s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0022756934631615877, 'learning_rate': 1e-05, 'num_tokens': 891481786.0, 'completions/mean_length': 7338.546875, 'completions/min_length': 1045.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6815.255859375, 'completions/min_terminated_length': 1045.0, 'completions/max_terminated_length': 16063.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.23356688022613525, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01869349181652069, 'sampling/sampling_logp_difference/max': 16.185659408569336, 'sampling/importance_sampling_ratio/min': 9.346680940325314e-08, 'sampling/importance_sampling_ratio/mean': 0.99998939037323, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.913008001494745e-05, 'epoch': 0.89}
+
+ 94%|█████████▍| 964/1024 [44:09:38<2:43:40, 163.68s/it][AINFO 12-02 15:40:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:40:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:40:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:40:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 94%|█████████▍| 965/1024 [44:12:24<2:41:43, 164.47s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0026985728181898594, 'learning_rate': 1e-05, 'num_tokens': 892419180.0, 'completions/mean_length': 7139.390625, 'completions/min_length': 876.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6684.7373046875, 'completions/min_terminated_length': 876.0, 'completions/max_terminated_length': 16286.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.35720276832580566, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019563795998692513, 'sampling/sampling_logp_difference/max': 5.468259811401367, 'sampling/importance_sampling_ratio/min': 0.004218566697090864, 'sampling/importance_sampling_ratio/mean': 1.0000505447387695, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.805827817719546e-05, 'epoch': 0.89}
+
+ 94%|█████████▍| 965/1024 [44:12:24<2:41:43, 164.47s/it][AINFO 12-02 15:43:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:43:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:43:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:43:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 94%|█████████▍| 966/1024 [44:15:27<2:44:23, 170.06s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0021103713661432266, 'learning_rate': 1e-05, 'num_tokens': 893476928.0, 'completions/mean_length': 8109.46875, 'completions/min_length': 1306.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7773.10546875, 'completions/min_terminated_length': 1306.0, 'completions/max_terminated_length': 15602.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019987473264336586, 'sampling/sampling_logp_difference/max': 3.920133590698242, 'sampling/importance_sampling_ratio/min': 0.019838443025946617, 'sampling/importance_sampling_ratio/mean': 1.0000065565109253, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.48686999700476e-05, 'epoch': 0.89}
+
+ 94%|█████████▍| 966/1024 [44:15:27<2:44:23, 170.06s/it][AINFO 12-02 15:46:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:46:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:46:44 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:46:44 [block_pool.py:292] Successfully reset prefix cache
+
+ 94%|█████████▍| 967/1024 [44:18:04<2:37:48, 166.12s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0016288732877001166, 'learning_rate': 1e-05, 'num_tokens': 894351311.0, 'completions/mean_length': 6676.1796875, 'completions/min_length': 1235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6443.1923828125, 'completions/min_terminated_length': 1235.0, 'completions/max_terminated_length': 16109.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.640625, 'reward_std': 0.2806519567966461, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016381673514842987, 'sampling/sampling_logp_difference/max': 1.6929001808166504, 'sampling/importance_sampling_ratio/min': 0.18398517370224, 'sampling/importance_sampling_ratio/mean': 1.0000286102294922, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.605537510040449e-05, 'epoch': 0.89}
+
+ 94%|█████████▍| 967/1024 [44:18:04<2:37:48, 166.12s/it][AINFO 12-02 15:49:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:49:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:49:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:49:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 95%|█████████▍| 968/1024 [44:21:11<2:40:58, 172.47s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002184785669669509, 'learning_rate': 1e-05, 'num_tokens': 895397584.0, 'completions/mean_length': 7969.4453125, 'completions/min_length': 1079.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7409.12548828125, 'completions/min_terminated_length': 1079.0, 'completions/max_terminated_length': 16272.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.35901516675949097, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020078090950846672, 'sampling/sampling_logp_difference/max': 6.219200134277344, 'sampling/importance_sampling_ratio/min': 0.0019908370450139046, 'sampling/importance_sampling_ratio/mean': 1.0000383853912354, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.8187974218526506e-05, 'epoch': 0.89}
+
+ 95%|█████████▍| 968/1024 [44:21:11<2:40:58, 172.47s/it][AINFO 12-02 15:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:52:28 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 95%|█████████▍| 969/1024 [44:24:22<2:43:10, 178.01s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019623669795691967, 'learning_rate': 1e-05, 'num_tokens': 896497839.0, 'completions/mean_length': 8460.9921875, 'completions/min_length': 1228.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7641.37060546875, 'completions/min_terminated_length': 1228.0, 'completions/max_terminated_length': 16113.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3248382806777954, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019685499370098114, 'sampling/sampling_logp_difference/max': 3.961641311645508, 'sampling/importance_sampling_ratio/min': 0.01903185248374939, 'sampling/importance_sampling_ratio/mean': 1.0000288486480713, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.894390519893932e-05, 'epoch': 0.89}
+
+ 95%|█████████▍| 969/1024 [44:24:22<2:43:10, 178.01s/it][AINFO 12-02 15:55:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:39 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:55:39 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 95%|█████████▍| 970/1024 [44:27:36<2:44:19, 182.58s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0018204165389761329, 'learning_rate': 1e-05, 'num_tokens': 897495992.0, 'completions/mean_length': 7645.0703125, 'completions/min_length': 1102.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7139.51220703125, 'completions/min_terminated_length': 1102.0, 'completions/max_terminated_length': 16238.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019901905208826065, 'sampling/sampling_logp_difference/max': 1.8535699844360352, 'sampling/importance_sampling_ratio/min': 0.15667682886123657, 'sampling/importance_sampling_ratio/mean': 1.0000205039978027, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6162576805054414e-05, 'epoch': 0.89}
+
+ 95%|█████████▍| 970/1024 [44:27:36<2:44:19, 182.58s/it][AINFO 12-02 15:58:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 15:58:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 95%|█████████▍| 971/1024 [44:30:24<2:37:31, 178.33s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.0009883403545245528, 'learning_rate': 1e-05, 'num_tokens': 898376438.0, 'completions/mean_length': 6715.921875, 'completions/min_length': 1154.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6562.4609375, 'completions/min_terminated_length': 1154.0, 'completions/max_terminated_length': 16223.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2772369980812073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018388349562883377, 'sampling/sampling_logp_difference/max': 2.4903736114501953, 'sampling/importance_sampling_ratio/min': 0.08287899196147919, 'sampling/importance_sampling_ratio/mean': 0.9999986886978149, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.109971039019001e-05, 'epoch': 0.89}
+
+ 95%|█████████▍| 971/1024 [44:30:24<2:37:31, 178.33s/it][AINFO 12-02 16:01:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:01:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:01:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:01:41 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 95%|█████████▍| 972/1024 [44:33:22<2:34:20, 178.10s/it][A
+                                                        [A{'loss': 0.0001, 'grad_norm': 0.001912579289637506, 'learning_rate': 1e-05, 'num_tokens': 899441645.0, 'completions/mean_length': 8143.6796875, 'completions/min_length': 1386.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7666.966796875, 'completions/min_terminated_length': 1386.0, 'completions/max_terminated_length': 15385.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2709311544895172, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021718673408031464, 'sampling/sampling_logp_difference/max': 3.5309178829193115, 'sampling/importance_sampling_ratio/min': 0.02927802875638008, 'sampling/importance_sampling_ratio/mean': 0.9999736547470093, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.976140775805106e-05, 'epoch': 0.89}
+
+ 95%|█████████▍| 972/1024 [44:33:22<2:34:20, 178.10s/it][AINFO 12-02 16:04:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:04:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:04:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:04:38 [block_pool.py:292] Successfully reset prefix cache
+
+ 95%|█████████▌| 973/1024 [44:36:23<2:32:19, 179.21s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0014706342481076717, 'learning_rate': 1e-05, 'num_tokens': 900438159.0, 'completions/mean_length': 7605.640625, 'completions/min_length': 1303.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7248.79638671875, 'completions/min_terminated_length': 1303.0, 'completions/max_terminated_length': 16142.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.23857943713665009, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020193345844745636, 'sampling/sampling_logp_difference/max': 1.7302441596984863, 'sampling/importance_sampling_ratio/min': 0.17724111676216125, 'sampling/importance_sampling_ratio/mean': 1.0000338554382324, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9483486969184014e-05, 'epoch': 0.9}
+
+ 95%|█████████▌| 973/1024 [44:36:23<2:32:19, 179.21s/it][AINFO 12-02 16:07:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:07:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:07:40 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:07:40 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 95%|█████████▌| 974/1024 [44:39:26<2:30:13, 180.26s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0010454566217958927, 'learning_rate': 1e-05, 'num_tokens': 901393278.0, 'completions/mean_length': 7301.4921875, 'completions/min_length': 800.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6776.0576171875, 'completions/min_terminated_length': 800.0, 'completions/max_terminated_length': 15260.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.19780021905899048, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01950974389910698, 'sampling/sampling_logp_difference/max': 1.6778583526611328, 'sampling/importance_sampling_ratio/min': 0.18677355349063873, 'sampling/importance_sampling_ratio/mean': 0.9999399185180664, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.808731028380862e-05, 'epoch': 0.9}
+
+ 95%|█████████▌| 974/1024 [44:39:26<2:30:13, 180.26s/it][AINFO 12-02 16:10:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:10:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:10:43 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:10:43 [block_pool.py:292] Successfully reset prefix cache
+
+ 95%|█████████▌| 975/1024 [44:42:35<2:29:25, 182.97s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019710524939000607, 'learning_rate': 1e-05, 'num_tokens': 902457149.0, 'completions/mean_length': 8159.3671875, 'completions/min_length': 901.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7683.5615234375, 'completions/min_terminated_length': 901.0, 'completions/max_terminated_length': 16095.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2782978415489197, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019966114312410355, 'sampling/sampling_logp_difference/max': 3.7196807861328125, 'sampling/importance_sampling_ratio/min': 0.024241704493761063, 'sampling/importance_sampling_ratio/mean': 0.9998645782470703, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.773877228421043e-05, 'epoch': 0.9}
+
+ 95%|█████████▌| 975/1024 [44:42:35<2:29:25, 182.97s/it][AINFO 12-02 16:13:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:13:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:13:52 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:13:52 [block_pool.py:292] Successfully reset prefix cache
+
+ 95%|█████████▌| 976/1024 [44:45:20<2:21:58, 177.47s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001469166367314756, 'learning_rate': 1e-05, 'num_tokens': 903327146.0, 'completions/mean_length': 6631.3515625, 'completions/min_length': 1399.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6554.55908203125, 'completions/min_terminated_length': 1399.0, 'completions/max_terminated_length': 14686.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.18990950286388397, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019503455609083176, 'sampling/sampling_logp_difference/max': 1.7414004802703857, 'sampling/importance_sampling_ratio/min': 0.17527475953102112, 'sampling/importance_sampling_ratio/mean': 0.9999025464057922, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.514324475872854e-05, 'epoch': 0.9}
+
+ 95%|█████████▌| 976/1024 [44:45:20<2:21:58, 177.47s/it][AINFO 12-02 16:16:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:16:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:16:37 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:16:37 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 95%|█████████▌| 977/1024 [44:48:01<2:15:14, 172.65s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0009152962593361735, 'learning_rate': 1e-05, 'num_tokens': 904268907.0, 'completions/mean_length': 7219.3828125, 'completions/min_length': 1139.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7073.9130859375, 'completions/min_terminated_length': 1139.0, 'completions/max_terminated_length': 15232.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.14966705441474915, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020433466881513596, 'sampling/sampling_logp_difference/max': 8.539098739624023, 'sampling/importance_sampling_ratio/min': 0.00019566653645597398, 'sampling/importance_sampling_ratio/mean': 0.9999702572822571, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9038724392194126e-05, 'epoch': 0.9}
+
+ 95%|█████████▌| 977/1024 [44:48:01<2:15:14, 172.65s/it][AINFO 12-02 16:19:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:18 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:19:18 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 96%|█████████▌| 978/1024 [44:51:16<2:17:24, 179.23s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019957241602241993, 'learning_rate': 1e-05, 'num_tokens': 905340439.0, 'completions/mean_length': 8212.53125, 'completions/min_length': 1380.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7739.80126953125, 'completions/min_terminated_length': 1380.0, 'completions/max_terminated_length': 16230.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2767002582550049, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020036667585372925, 'sampling/sampling_logp_difference/max': 2.665867805480957, 'sampling/importance_sampling_ratio/min': 0.06953898072242737, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.9429157570557436e-05, 'epoch': 0.9}
+
+ 96%|█████████▌| 978/1024 [44:51:16<2:17:24, 179.23s/it][AINFO 12-02 16:22:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:22:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:22:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:22:33 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 96%|█████████▌| 979/1024 [44:53:14<2:00:38, 160.86s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.001903178752399981, 'learning_rate': 1e-05, 'num_tokens': 906028024.0, 'completions/mean_length': 5211.8203125, 'completions/min_length': 1027.0, 'completions/max_length': 14376.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5211.8203125, 'completions/min_terminated_length': 1027.0, 'completions/max_terminated_length': 14376.0, 'rewards/accuracy_reward/mean': 0.703125, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.703125, 'reward_std': 0.2790592312812805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01786947436630726, 'sampling/sampling_logp_difference/max': 1.750545859336853, 'sampling/importance_sampling_ratio/min': 0.17367911338806152, 'sampling/importance_sampling_ratio/mean': 1.000001311302185, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.0393862605014874e-05, 'epoch': 0.9}
+
+ 96%|█████████▌| 979/1024 [44:53:14<2:00:38, 160.86s/it][AINFO 12-02 16:24:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:24:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:24:31 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:24:31 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 96%|█████████▌| 980/1024 [44:56:34<2:06:34, 172.61s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.0016716814134269953, 'learning_rate': 1e-05, 'num_tokens': 907147981.0, 'completions/mean_length': 8598.0390625, 'completions/min_length': 327.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7792.5947265625, 'completions/min_terminated_length': 327.0, 'completions/max_terminated_length': 15928.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.192268505692482, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020783785730600357, 'sampling/sampling_logp_difference/max': 3.2271499633789062, 'sampling/importance_sampling_ratio/min': 0.03967040032148361, 'sampling/importance_sampling_ratio/mean': 0.999955415725708, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.671987690267997e-05, 'epoch': 0.9}
+
+ 96%|█████████▌| 980/1024 [44:56:34<2:06:34, 172.61s/it][AINFO 12-02 16:27:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:27:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:27:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:27:51 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 96%|█████████▌| 981/1024 [44:59:26<2:03:29, 172.32s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016602674731984735, 'learning_rate': 1e-05, 'num_tokens': 908123427.0, 'completions/mean_length': 7463.546875, 'completions/min_length': 711.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7175.7900390625, 'completions/min_terminated_length': 711.0, 'completions/max_terminated_length': 16256.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020228194072842598, 'sampling/sampling_logp_difference/max': 2.372500419616699, 'sampling/importance_sampling_ratio/min': 0.09324727952480316, 'sampling/importance_sampling_ratio/mean': 0.9999887347221375, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.8982249748187314e-05, 'epoch': 0.9}
+
+ 96%|█████████▌| 981/1024 [44:59:26<2:03:29, 172.32s/it][AINFO 12-02 16:30:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:42 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:30:42 [block_pool.py:292] Successfully reset prefix cache
+
+ 96%|█████████▌| 982/1024 [45:02:25<2:01:58, 174.26s/it][A
+                                                        [A{'loss': -0.0, 'grad_norm': 0.002113994909450412, 'learning_rate': 1e-05, 'num_tokens': 909204336.0, 'completions/mean_length': 8294.9140625, 'completions/min_length': 1070.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7897.08984375, 'completions/min_terminated_length': 1070.0, 'completions/max_terminated_length': 16328.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.24381661415100098, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020874924957752228, 'sampling/sampling_logp_difference/max': 8.77961540222168, 'sampling/importance_sampling_ratio/min': 0.0001538372307550162, 'sampling/importance_sampling_ratio/mean': 0.9999390840530396, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.876467437497922e-05, 'epoch': 0.9}
+
+ 96%|█████████▌| 982/1024 [45:02:25<2:01:58, 174.26s/it][AINFO 12-02 16:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:33:41 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:33:41 [block_pool.py:292] Successfully reset prefix cache
+
+ 96%|█████████▌| 983/1024 [45:05:03<1:55:56, 169.66s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0017960413824766874, 'learning_rate': 1e-05, 'num_tokens': 910165938.0, 'completions/mean_length': 7365.078125, 'completions/min_length': 1696.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7074.14501953125, 'completions/min_terminated_length': 1696.0, 'completions/max_terminated_length': 15629.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.20517179369926453, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02127302996814251, 'sampling/sampling_logp_difference/max': 2.7232913970947266, 'sampling/importance_sampling_ratio/min': 0.06565829366445541, 'sampling/importance_sampling_ratio/mean': 1.0000324249267578, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.417091753339264e-05, 'epoch': 0.9}
+
+ 96%|█████████▌| 983/1024 [45:05:03<1:55:56, 169.66s/it][AINFO 12-02 16:36:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:36:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:36:20 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:36:20 [block_pool.py:292] Successfully reset prefix cache
+
+ 96%|█████████▌| 984/1024 [45:08:22<1:58:49, 178.24s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016361246816813946, 'learning_rate': 1e-05, 'num_tokens': 911262019.0, 'completions/mean_length': 8413.4453125, 'completions/min_length': 1404.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7810.630859375, 'completions/min_terminated_length': 1404.0, 'completions/max_terminated_length': 16179.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02066284976899624, 'sampling/sampling_logp_difference/max': 3.769989490509033, 'sampling/importance_sampling_ratio/min': 0.023052306845784187, 'sampling/importance_sampling_ratio/mean': 0.9999500513076782, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.729559541123308e-05, 'epoch': 0.91}
+
+ 96%|█████████▌| 984/1024 [45:08:22<1:58:49, 178.24s/it][AINFO 12-02 16:39:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:39:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:39:38 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:39:38 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 96%|█████████▌| 985/1024 [45:11:32<1:58:07, 181.73s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0022313296794891357, 'learning_rate': 1e-05, 'num_tokens': 912243391.0, 'completions/mean_length': 7516.15625, 'completions/min_length': 1813.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6845.4794921875, 'completions/min_terminated_length': 1813.0, 'completions/max_terminated_length': 15389.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2556639611721039, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019472699612379074, 'sampling/sampling_logp_difference/max': 5.939077377319336, 'sampling/importance_sampling_ratio/min': 0.0026344594080001116, 'sampling/importance_sampling_ratio/mean': 1.000022292137146, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.378385844778677e-05, 'epoch': 0.91}
+
+ 96%|█████████▌| 985/1024 [45:11:32<1:58:07, 181.73s/it][AINFO 12-02 16:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:42:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:42:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 96%|█████████▋| 986/1024 [45:14:18<1:52:15, 177.25s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015504715265706182, 'learning_rate': 1e-05, 'num_tokens': 913117291.0, 'completions/mean_length': 6708.03125, 'completions/min_length': 901.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6475.80810546875, 'completions/min_terminated_length': 901.0, 'completions/max_terminated_length': 16266.0, 'rewards/accuracy_reward/mean': 0.6015625, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.6015625, 'reward_std': 0.23646269738674164, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019405167549848557, 'sampling/sampling_logp_difference/max': 1.9520618915557861, 'sampling/importance_sampling_ratio/min': 0.27674365043640137, 'sampling/importance_sampling_ratio/mean': 1.0000698566436768, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.114544194384507e-05, 'epoch': 0.91}
+
+ 96%|█████████▋| 986/1024 [45:14:18<1:52:15, 177.25s/it][AINFO 12-02 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:45:35 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 96%|█████████▋| 987/1024 [45:17:11<1:48:30, 175.95s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002042568987235427, 'learning_rate': 1e-05, 'num_tokens': 914075366.0, 'completions/mean_length': 7333.2734375, 'completions/min_length': 1610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6809.67724609375, 'completions/min_terminated_length': 1610.0, 'completions/max_terminated_length': 15852.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.19568344950675964, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020896106958389282, 'sampling/sampling_logp_difference/max': 4.476626396179199, 'sampling/importance_sampling_ratio/min': 0.011371712200343609, 'sampling/importance_sampling_ratio/mean': 0.9999376535415649, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.7892625780623348e-05, 'epoch': 0.91}
+
+ 96%|█████████▋| 987/1024 [45:17:11<1:48:30, 175.95s/it][AINFO 12-02 16:48:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:48:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:48:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:48:28 [block_pool.py:292] Successfully reset prefix cache
+
+ 96%|█████████▋| 988/1024 [45:20:17<1:47:15, 178.78s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.002333950251340866, 'learning_rate': 1e-05, 'num_tokens': 915214612.0, 'completions/mean_length': 8767.171875, 'completions/min_length': 805.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 8121.67822265625, 'completions/min_terminated_length': 805.0, 'completions/max_terminated_length': 16114.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3185402750968933, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01999882236123085, 'sampling/sampling_logp_difference/max': 2.5206830501556396, 'sampling/importance_sampling_ratio/min': 0.08040466904640198, 'sampling/importance_sampling_ratio/mean': 0.9999717473983765, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 9.691916602605488e-05, 'epoch': 0.91}
+
+ 96%|█████████▋| 988/1024 [45:20:17<1:47:15, 178.78s/it][AINFO 12-02 16:51:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:51:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:51:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:51:33 [block_pool.py:292] Successfully reset prefix cache
+
+ 97%|█████████▋| 989/1024 [45:22:53<1:40:26, 172.19s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019700899720191956, 'learning_rate': 1e-05, 'num_tokens': 916077756.0, 'completions/mean_length': 6590.75, 'completions/min_length': 747.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6355.71240234375, 'completions/min_terminated_length': 747.0, 'completions/max_terminated_length': 14714.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02064850926399231, 'sampling/sampling_logp_difference/max': 2.0908496379852295, 'sampling/importance_sampling_ratio/min': 0.12358208745718002, 'sampling/importance_sampling_ratio/mean': 0.9999971985816956, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.033885358012412e-05, 'epoch': 0.91}
+
+ 97%|█████████▋| 989/1024 [45:22:53<1:40:26, 172.19s/it][AINFO 12-02 16:54:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:54:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:54:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:54:10 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 97%|█████████▋| 990/1024 [45:25:36<1:35:55, 169.27s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019744476303458214, 'learning_rate': 1e-05, 'num_tokens': 916929407.0, 'completions/mean_length': 6522.4609375, 'completions/min_length': 1118.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6121.5849609375, 'completions/min_terminated_length': 1118.0, 'completions/max_terminated_length': 16040.0, 'rewards/accuracy_reward/mean': 0.6796875, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.6796875, 'reward_std': 0.17123225331306458, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.019773945212364197, 'sampling/sampling_logp_difference/max': 3.8414759635925293, 'sampling/importance_sampling_ratio/min': 0.02146190032362938, 'sampling/importance_sampling_ratio/mean': 0.9999533295631409, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.1755995021521812e-05, 'epoch': 0.91}
+
+ 97%|█████████▋| 990/1024 [45:25:36<1:35:55, 169.27s/it][AINFO 12-02 16:56:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:56:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:56:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:56:53 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 97%|█████████▋| 991/1024 [45:28:34<1:34:30, 171.83s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0016266672173514962, 'learning_rate': 1e-05, 'num_tokens': 918071281.0, 'completions/mean_length': 8761.890625, 'completions/min_length': 997.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 8115.94921875, 'completions/min_terminated_length': 997.0, 'completions/max_terminated_length': 15917.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2790592312812805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020015928894281387, 'sampling/sampling_logp_difference/max': 1.994290828704834, 'sampling/importance_sampling_ratio/min': 0.13611014187335968, 'sampling/importance_sampling_ratio/mean': 1.0000755786895752, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.507892253968748e-05, 'epoch': 0.91}
+
+ 97%|█████████▋| 991/1024 [45:28:34<1:34:30, 171.83s/it][AINFO 12-02 16:59:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:59:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:59:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 16:59:50 [block_pool.py:292] Successfully reset prefix cache
+
+ 97%|█████████▋| 992/1024 [45:30:57<1:27:07, 163.36s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0015871055657044053, 'learning_rate': 1e-05, 'num_tokens': 918924285.0, 'completions/mean_length': 6519.71875, 'completions/min_length': 799.0, 'completions/max_length': 14437.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6519.71875, 'completions/min_terminated_length': 799.0, 'completions/max_terminated_length': 14437.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.24435831606388092, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020145665854215622, 'sampling/sampling_logp_difference/max': 5.772953033447266, 'sampling/importance_sampling_ratio/min': 0.00311055826023221, 'sampling/importance_sampling_ratio/mean': 0.9999362230300903, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.507529411057476e-05, 'epoch': 0.91}
+
+ 97%|█████████▋| 992/1024 [45:30:57<1:27:07, 163.36s/it][AINFO 12-02 17:02:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:02:14 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 97%|█████████▋| 993/1024 [45:33:49<1:25:44, 165.95s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0008862537797540426, 'learning_rate': 1e-05, 'num_tokens': 919794473.0, 'completions/mean_length': 6635.40625, 'completions/min_length': 884.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6320.93505859375, 'completions/min_terminated_length': 884.0, 'completions/max_terminated_length': 15575.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2619747221469879, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020844481885433197, 'sampling/sampling_logp_difference/max': 10.60013198852539, 'sampling/importance_sampling_ratio/min': 2.4912720618885942e-05, 'sampling/importance_sampling_ratio/mean': 0.9999967813491821, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.420454954219167e-05, 'epoch': 0.91}
+
+ 97%|█████████▋| 993/1024 [45:33:49<1:25:44, 165.95s/it][AINFO 12-02 17:05:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:05:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 97%|█████████▋| 994/1024 [45:36:13<1:19:38, 159.30s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0026396396569907665, 'learning_rate': 1e-05, 'num_tokens': 920600345.0, 'completions/mean_length': 6152.6875, 'completions/min_length': 901.0, 'completions/max_length': 13773.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6152.6875, 'completions/min_terminated_length': 901.0, 'completions/max_terminated_length': 13773.0, 'rewards/accuracy_reward/mean': 0.6640625, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.6640625, 'reward_std': 0.36667346954345703, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019302403554320335, 'sampling/sampling_logp_difference/max': 7.552577495574951, 'sampling/importance_sampling_ratio/min': 0.0005247558001428843, 'sampling/importance_sampling_ratio/mean': 0.9999976754188538, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.528670201078057e-05, 'epoch': 0.91}
+
+ 97%|█████████▋| 994/1024 [45:36:13<1:19:38, 159.30s/it][AINFO 12-02 17:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:07:30 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:07:30 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 97%|█████████▋| 995/1024 [45:39:04<1:18:43, 162.90s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0014840757939964533, 'learning_rate': 1e-05, 'num_tokens': 921610053.0, 'completions/mean_length': 7739.40625, 'completions/min_length': 1549.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7671.33837890625, 'completions/min_terminated_length': 1549.0, 'completions/max_terminated_length': 16197.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.21382391452789307, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021080249920487404, 'sampling/sampling_logp_difference/max': 5.287613868713379, 'sampling/importance_sampling_ratio/min': 0.005053804721683264, 'sampling/importance_sampling_ratio/mean': 1.0000180006027222, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.034320886603382e-05, 'epoch': 0.92}
+
+ 97%|█████████▋| 995/1024 [45:39:04<1:18:43, 162.90s/it][AINFO 12-02 17:10:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:10:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:10:21 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:10:21 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 97%|█████████▋| 996/1024 [45:42:07<1:18:50, 168.93s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0025676132645457983, 'learning_rate': 1e-05, 'num_tokens': 922652294.0, 'completions/mean_length': 7977.4453125, 'completions/min_length': 1435.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7491.115234375, 'completions/min_terminated_length': 1435.0, 'completions/max_terminated_length': 16340.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.3158867657184601, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01991220936179161, 'sampling/sampling_logp_difference/max': 9.748065948486328, 'sampling/importance_sampling_ratio/min': 5.84075169172138e-05, 'sampling/importance_sampling_ratio/mean': 0.9999879598617554, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.586993254131812e-05, 'epoch': 0.92}
+
+ 97%|█████████▋| 996/1024 [45:42:07<1:18:50, 168.93s/it][AINFO 12-02 17:13:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:13:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:13:24 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:13:24 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 97%|█████████▋| 997/1024 [45:44:51<1:15:15, 167.24s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0027864493895322084, 'learning_rate': 1e-05, 'num_tokens': 923640559.0, 'completions/mean_length': 7542.6953125, 'completions/min_length': 846.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7257.49169921875, 'completions/min_terminated_length': 846.0, 'completions/max_terminated_length': 14908.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.250127375125885, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020853929221630096, 'sampling/sampling_logp_difference/max': 5.713392734527588, 'sampling/importance_sampling_ratio/min': 0.003301452612504363, 'sampling/importance_sampling_ratio/mean': 1.0000183582305908, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.417803211254068e-05, 'epoch': 0.92}
+
+ 97%|█████████▋| 997/1024 [45:44:51<1:15:15, 167.24s/it][AINFO 12-02 17:16:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:16:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:16:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:16:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 97%|█████████▋| 998/1024 [45:47:33<1:11:48, 165.70s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.0019515904132276773, 'learning_rate': 1e-05, 'num_tokens': 924511183.0, 'completions/mean_length': 6640.8125, 'completions/min_length': 773.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6564.09423828125, 'completions/min_terminated_length': 773.0, 'completions/max_terminated_length': 15388.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2093530297279358, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019868293777108192, 'sampling/sampling_logp_difference/max': 2.0961050987243652, 'sampling/importance_sampling_ratio/min': 0.12293431162834167, 'sampling/importance_sampling_ratio/mean': 1.0000251531600952, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.7433540001075016e-05, 'epoch': 0.92}
+
+ 97%|█████████▋| 998/1024 [45:47:33<1:11:48, 165.70s/it][AINFO 12-02 17:18:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:18:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:18:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:18:49 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 98%|█████████▊| 999/1024 [45:50:50<1:12:57, 175.09s/it][A
+                                                        [A{'loss': 0.0, 'grad_norm': 0.001988386968150735, 'learning_rate': 1e-05, 'num_tokens': 925763576.0, 'completions/mean_length': 9636.8203125, 'completions/min_length': 1651.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 9002.470703125, 'completions/min_terminated_length': 1651.0, 'completions/max_terminated_length': 16066.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.27724191546440125, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02132497914135456, 'sampling/sampling_logp_difference/max': 5.212202548980713, 'sampling/importance_sampling_ratio/min': 0.005449657328426838, 'sampling/importance_sampling_ratio/mean': 0.9999635815620422, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.769043645566853e-05, 'epoch': 0.92}
+
+ 98%|█████████▊| 999/1024 [45:50:50<1:12:57, 175.09s/it][AINFO 12-02 17:22:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:22:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:22:06 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:22:06 [block_pool.py:292] Successfully reset prefix cache
+
+ 98%|█████████▊| 1000/1024 [45:53:55<1:11:17, 178.22s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.003170250216498971, 'learning_rate': 1e-05, 'num_tokens': 926859343.0, 'completions/mean_length': 8401.3046875, 'completions/min_length': 717.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7939.49560546875, 'completions/min_terminated_length': 717.0, 'completions/max_terminated_length': 16319.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2732901871204376, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019560299813747406, 'sampling/sampling_logp_difference/max': 2.3597278594970703, 'sampling/importance_sampling_ratio/min': 0.09444592148065567, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.7578363794164034e-05, 'epoch': 0.92}
+
+ 98%|█████████▊| 1000/1024 [45:53:55<1:11:17, 178.22s/it][AINFO 12-02 17:25:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:25:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:25:12 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:25:12 [block_pool.py:292] Successfully reset prefix cache
+
+ 98%|█████████▊| 1001/1024 [45:56:53<1:08:12, 177.94s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0017524081049486995, 'learning_rate': 1e-05, 'num_tokens': 927765029.0, 'completions/mean_length': 6919.359375, 'completions/min_length': 942.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6844.83447265625, 'completions/min_terminated_length': 942.0, 'completions/max_terminated_length': 16328.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2391034960746765, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.022072095423936844, 'sampling/sampling_logp_difference/max': 7.495959281921387, 'sampling/importance_sampling_ratio/min': 0.0005553237278945744, 'sampling/importance_sampling_ratio/mean': 1.0000052452087402, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.082756544856238e-05, 'epoch': 0.92}
+
+ 98%|█████████▊| 1001/1024 [45:56:53<1:08:12, 177.94s/it][AINFO 12-02 17:28:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:09 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:28:09 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 98%|█████████▊| 1002/1024 [45:59:37<1:03:43, 173.78s/it][A
+                                                         [A{'loss': -0.0, 'grad_norm': 0.0012765615247189999, 'learning_rate': 1e-05, 'num_tokens': 928678765.0, 'completions/mean_length': 6978.0, 'completions/min_length': 1025.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6350.93359375, 'completions/min_terminated_length': 1025.0, 'completions/max_terminated_length': 16038.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.27222442626953125, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01898837275803089, 'sampling/sampling_logp_difference/max': 2.3717310428619385, 'sampling/importance_sampling_ratio/min': 0.09331905096769333, 'sampling/importance_sampling_ratio/mean': 0.9999797344207764, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.541331211134093e-05, 'epoch': 0.92}
+
+ 98%|█████████▊| 1002/1024 [45:59:37<1:03:43, 173.78s/it][AINFO 12-02 17:30:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:30:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:30:53 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:30:53 [block_pool.py:292] Successfully reset prefix cache
+
+ 98%|█████████▊| 1003/1024 [46:02:33<1:01:04, 174.50s/it][A
+                                                         [A{'loss': 0.0, 'grad_norm': 0.002102622063830495, 'learning_rate': 1e-05, 'num_tokens': 929633524.0, 'completions/mean_length': 7309.1796875, 'completions/min_length': 1204.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6862.87646484375, 'completions/min_terminated_length': 1204.0, 'completions/max_terminated_length': 15538.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018782395869493484, 'sampling/sampling_logp_difference/max': 4.917522430419922, 'sampling/importance_sampling_ratio/min': 0.007317237090319395, 'sampling/importance_sampling_ratio/mean': 0.9999492764472961, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.7366156195494113e-05, 'epoch': 0.92}
+
+ 98%|█████████▊| 1003/1024 [46:02:33<1:01:04, 174.50s/it][AINFO 12-02 17:33:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:33:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:33:49 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:33:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 98%|█████████▊| 1004/1024 [46:05:38<59:12, 177.63s/it]  [A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002629871480166912, 'learning_rate': 1e-05, 'num_tokens': 930672346.0, 'completions/mean_length': 7958.109375, 'completions/min_length': 1087.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7396.3837890625, 'completions/min_terminated_length': 1087.0, 'completions/max_terminated_length': 16249.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.32878512144088745, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021886982023715973, 'sampling/sampling_logp_difference/max': 2.2638332843780518, 'sampling/importance_sampling_ratio/min': 0.10395123809576035, 'sampling/importance_sampling_ratio/mean': 1.0000083446502686, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.208061495271977e-05, 'epoch': 0.92}
+
+ 98%|█████████▊| 1004/1024 [46:05:38<59:12, 177.63s/it][AINFO 12-02 17:36:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:36:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:36:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:36:54 [block_pool.py:292] Successfully reset prefix cache
+
+ 98%|█████████▊| 1005/1024 [46:08:46<57:14, 180.75s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0018781935796141624, 'learning_rate': 1e-05, 'num_tokens': 931733527.0, 'completions/mean_length': 8138.0390625, 'completions/min_length': 2344.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7872.0400390625, 'completions/min_terminated_length': 2344.0, 'completions/max_terminated_length': 16193.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.23592591285705566, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020889881998300552, 'sampling/sampling_logp_difference/max': 4.278860092163086, 'sampling/importance_sampling_ratio/min': 0.013858450576663017, 'sampling/importance_sampling_ratio/mean': 0.9999618530273438, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.3955027220144984e-05, 'epoch': 0.92}
+
+ 98%|█████████▊| 1005/1024 [46:08:46<57:14, 180.75s/it][AINFO 12-02 17:40:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:40:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:40:02 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:40:02 [block_pool.py:292] Successfully reset prefix cache
+
+ 98%|█████████▊| 1006/1024 [46:11:34<53:05, 176.95s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.003405711380764842, 'learning_rate': 1e-05, 'num_tokens': 932635203.0, 'completions/mean_length': 6891.28125, 'completions/min_length': 915.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6740.603515625, 'completions/min_terminated_length': 915.0, 'completions/max_terminated_length': 15226.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.26143792271614075, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02142455242574215, 'sampling/sampling_logp_difference/max': 4.630235195159912, 'sampling/importance_sampling_ratio/min': 0.009752465412020683, 'sampling/importance_sampling_ratio/mean': 0.9999682903289795, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.024995860163472e-05, 'epoch': 0.93}
+
+ 98%|█████████▊| 1006/1024 [46:11:34<53:05, 176.95s/it][AINFO 12-02 17:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:42:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 98%|█████████▊| 1007/1024 [46:14:53<52:03, 183.71s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0026338701136410236, 'learning_rate': 1e-05, 'num_tokens': 933650655.0, 'completions/mean_length': 7778.59375, 'completions/min_length': 637.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 6888.37939453125, 'completions/min_terminated_length': 637.0, 'completions/max_terminated_length': 16185.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.251193106174469, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018332865089178085, 'sampling/sampling_logp_difference/max': 2.2289390563964844, 'sampling/importance_sampling_ratio/min': 0.10764256864786148, 'sampling/importance_sampling_ratio/mean': 0.9999889135360718, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 5.730707380280364e-05, 'epoch': 0.93}
+
+ 98%|█████████▊| 1007/1024 [46:14:53<52:03, 183.71s/it][AINFO 12-02 17:46:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:46:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:46:10 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:46:10 [block_pool.py:292] Successfully reset prefix cache
+
+ 98%|█████████▊| 1008/1024 [46:17:50<48:24, 181.54s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.00129215931519866, 'learning_rate': 1e-05, 'num_tokens': 934563895.0, 'completions/mean_length': 6985.25, 'completions/min_length': 943.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6759.68017578125, 'completions/min_terminated_length': 943.0, 'completions/max_terminated_length': 15755.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019822310656309128, 'sampling/sampling_logp_difference/max': 3.6018331050872803, 'sampling/importance_sampling_ratio/min': 0.02727368287742138, 'sampling/importance_sampling_ratio/mean': 0.9999982714653015, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.4250546971416043e-05, 'epoch': 0.93}
+
+ 98%|█████████▊| 1008/1024 [46:17:50<48:24, 181.54s/it][AINFO 12-02 17:49:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:07 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:49:07 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 99%|█████████▊| 1009/1024 [46:20:34<44:04, 176.30s/it][A
+                                                       [A{'loss': 0.0001, 'grad_norm': 0.0026081129908561707, 'learning_rate': 1e-05, 'num_tokens': 935585461.0, 'completions/mean_length': 7824.609375, 'completions/min_length': 1495.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7619.1845703125, 'completions/min_terminated_length': 1495.0, 'completions/max_terminated_length': 14950.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.29196253418922424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021068882197141647, 'sampling/sampling_logp_difference/max': 2.189469814300537, 'sampling/importance_sampling_ratio/min': 0.11197610199451447, 'sampling/importance_sampling_ratio/mean': 1.000022530555725, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.531040546382428e-05, 'epoch': 0.93}
+
+ 99%|█████████▊| 1009/1024 [46:20:34<44:04, 176.30s/it][AINFO 12-02 17:51:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:51 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:51:51 [block_pool.py:292] Successfully reset prefix cache
+
+ 99%|█████████▊| 1010/1024 [46:23:37<41:38, 178.44s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0011242982000112534, 'learning_rate': 1e-05, 'num_tokens': 936669588.0, 'completions/mean_length': 8317.6171875, 'completions/min_length': 799.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7989.71533203125, 'completions/min_terminated_length': 799.0, 'completions/max_terminated_length': 16225.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2987973093986511, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019333213567733765, 'sampling/sampling_logp_difference/max': 2.5013062953948975, 'sampling/importance_sampling_ratio/min': 0.08197784423828125, 'sampling/importance_sampling_ratio/mean': 0.9999628663063049, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 7.243399022627273e-05, 'epoch': 0.93}
+
+ 99%|█████████▊| 1010/1024 [46:23:37<41:38, 178.44s/it][AINFO 12-02 17:54:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:54 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:54:54 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 99%|█████████▊| 1011/1024 [46:26:39<38:50, 179.31s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0013771441299468279, 'learning_rate': 1e-05, 'num_tokens': 937711680.0, 'completions/mean_length': 8000.09375, 'completions/min_length': 881.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7798.88037109375, 'completions/min_terminated_length': 881.0, 'completions/max_terminated_length': 16351.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.34586966037750244, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02068420872092247, 'sampling/sampling_logp_difference/max': 1.6865003108978271, 'sampling/importance_sampling_ratio/min': 0.1851664036512375, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.63293037632684e-05, 'epoch': 0.93}
+
+ 99%|█████████▊| 1011/1024 [46:26:39<38:50, 179.31s/it][AINFO 12-02 17:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:55 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 17:57:55 [block_pool.py:292] Successfully reset prefix cache
+
+ 99%|█████████▉| 1012/1024 [46:29:51<36:38, 183.18s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0022698300890624523, 'learning_rate': 1e-05, 'num_tokens': 938837041.0, 'completions/mean_length': 8638.3828125, 'completions/min_length': 1391.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7910.16259765625, 'completions/min_terminated_length': 1391.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.265913724899292, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020748887211084366, 'sampling/sampling_logp_difference/max': 1.8259849548339844, 'sampling/importance_sampling_ratio/min': 0.1610589325428009, 'sampling/importance_sampling_ratio/mean': 1.000084638595581, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.46029551110405e-05, 'epoch': 0.93}
+
+ 99%|█████████▉| 1012/1024 [46:29:51<36:38, 183.18s/it][AINFO 12-02 18:01:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:01:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:01:08 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:01:08 [block_pool.py:292] Successfully reset prefix cache
+
+ 99%|█████████▉| 1013/1024 [46:32:44<33:02, 180.18s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0016996393678709865, 'learning_rate': 1e-05, 'num_tokens': 939838055.0, 'completions/mean_length': 7645.859375, 'completions/min_length': 1291.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7363.98388671875, 'completions/min_terminated_length': 1291.0, 'completions/max_terminated_length': 15282.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.36797165870666504, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020175879821181297, 'sampling/sampling_logp_difference/max': 5.202155113220215, 'sampling/importance_sampling_ratio/min': 0.005504688713699579, 'sampling/importance_sampling_ratio/mean': 1.0, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.476152748291497e-05, 'epoch': 0.93}
+
+ 99%|█████████▉| 1013/1024 [46:32:44<33:02, 180.18s/it][AINFO 12-02 18:04:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:04:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:04:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:04:01 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 99%|█████████▉| 1014/1024 [46:35:31<29:22, 176.24s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0011428170837461948, 'learning_rate': 1e-05, 'num_tokens': 940833865.0, 'completions/mean_length': 7594.515625, 'completions/min_length': 1118.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7383.568359375, 'completions/min_terminated_length': 1118.0, 'completions/max_terminated_length': 15700.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0221552737057209, 'sampling/sampling_logp_difference/max': 1.6875375509262085, 'sampling/importance_sampling_ratio/min': 0.18497444689273834, 'sampling/importance_sampling_ratio/mean': 1.0000180006027222, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.658313832806016e-05, 'epoch': 0.93}
+
+ 99%|█████████▉| 1014/1024 [46:35:31<29:22, 176.24s/it][AINFO 12-02 18:06:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:06:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:06:48 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:06:48 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+ 99%|█████████▉| 1015/1024 [46:38:43<27:08, 180.92s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0008651948301121593, 'learning_rate': 1e-05, 'num_tokens': 941880031.0, 'completions/mean_length': 8040.796875, 'completions/min_length': 1285.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7409.798828125, 'completions/min_terminated_length': 1285.0, 'completions/max_terminated_length': 15708.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2580180764198303, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018811888992786407, 'sampling/sampling_logp_difference/max': 3.8078248500823975, 'sampling/importance_sampling_ratio/min': 0.022196408361196518, 'sampling/importance_sampling_ratio/mean': 1.0000083446502686, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.908277196591371e-05, 'epoch': 0.93}
+
+ 99%|█████████▉| 1015/1024 [46:38:43<27:08, 180.92s/it][AINFO 12-02 18:10:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:10:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:10:00 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:10:00 [block_pool.py:292] Successfully reset prefix cache
+
+ 99%|█████████▉| 1016/1024 [46:41:43<24:04, 180.56s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0013348172651603818, 'learning_rate': 1e-05, 'num_tokens': 942961875.0, 'completions/mean_length': 8301.15625, 'completions/min_length': 1822.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8172.857421875, 'completions/min_terminated_length': 1822.0, 'completions/max_terminated_length': 15665.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.13098981976509094, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.021570749580860138, 'sampling/sampling_logp_difference/max': 1.8010674715042114, 'sampling/importance_sampling_ratio/min': 0.16512252390384674, 'sampling/importance_sampling_ratio/mean': 1.0000722408294678, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.9089446627494908e-05, 'epoch': 0.93}
+
+ 99%|█████████▉| 1016/1024 [46:41:43<24:04, 180.56s/it][AINFO 12-02 18:12:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:12:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:12:59 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:12:59 [block_pool.py:292] Successfully reset prefix cache
+
+ 99%|█████████▉| 1017/1024 [46:44:48<21:13, 181.90s/it][A
+                                                       [A{'loss': 0.0001, 'grad_norm': 0.001689477008767426, 'learning_rate': 1e-05, 'num_tokens': 944098458.0, 'completions/mean_length': 8742.8671875, 'completions/min_length': 1527.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.125, 'completions/mean_terminated_length': 7651.27734375, 'completions/min_terminated_length': 1527.0, 'completions/max_terminated_length': 16164.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.29432645440101624, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019856620579957962, 'sampling/sampling_logp_difference/max': 7.759915828704834, 'sampling/importance_sampling_ratio/min': 0.0004264925082679838, 'sampling/importance_sampling_ratio/mean': 1.0000076293945312, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 6.8323106461321e-05, 'epoch': 0.94}
+
+ 99%|█████████▉| 1017/1024 [46:44:48<21:13, 181.90s/it][AINFO 12-02 18:16:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:16:04 [block_pool.py:292] Successfully reset prefix cache
+
+ 99%|█████████▉| 1018/1024 [46:47:11<17:01, 170.31s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.002505479147657752, 'learning_rate': 1e-05, 'num_tokens': 944900933.0, 'completions/mean_length': 6124.8984375, 'completions/min_length': 642.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5793.95947265625, 'completions/min_terminated_length': 642.0, 'completions/max_terminated_length': 14617.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.2919674217700958, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017918679863214493, 'sampling/sampling_logp_difference/max': 2.294787883758545, 'sampling/importance_sampling_ratio/min': 0.10078276693820953, 'sampling/importance_sampling_ratio/mean': 1.0000877380371094, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.6683673215520685e-05, 'epoch': 0.94}
+
+ 99%|█████████▉| 1018/1024 [46:47:11<17:01, 170.31s/it][AINFO 12-02 18:18:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:18:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:18:28 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:18:28 [block_pool.py:292] Successfully reset prefix cache
+
+100%|█████████▉| 1019/1024 [46:50:16<14:33, 174.75s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0018793572671711445, 'learning_rate': 1e-05, 'num_tokens': 945925825.0, 'completions/mean_length': 7827.59375, 'completions/min_length': 1431.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7102.474609375, 'completions/min_terminated_length': 1431.0, 'completions/max_terminated_length': 15663.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.20411096513271332, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02043047547340393, 'sampling/sampling_logp_difference/max': 2.2534432411193848, 'sampling/importance_sampling_ratio/min': 0.10503693670034409, 'sampling/importance_sampling_ratio/mean': 0.9999958872795105, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 2.234648673038464e-05, 'epoch': 0.94}
+
+100%|█████████▉| 1019/1024 [46:50:16<14:33, 174.75s/it][AINFO 12-02 18:21:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:21:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:21:33 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:21:33 [block_pool.py:292] Successfully reset prefix cache
+
+100%|█████████▉| 1020/1024 [46:53:34<12:06, 181.60s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0006796515663154423, 'learning_rate': 1e-05, 'num_tokens': 946927697.0, 'completions/mean_length': 7688.75, 'completions/min_length': 908.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7031.12646484375, 'completions/min_terminated_length': 908.0, 'completions/max_terminated_length': 16046.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.18543371558189392, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01992659829556942, 'sampling/sampling_logp_difference/max': 5.359996795654297, 'sampling/importance_sampling_ratio/min': 0.004700921475887299, 'sampling/importance_sampling_ratio/mean': 1.000016689300537, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.511748491291655e-05, 'epoch': 0.94}
+
+100%|█████████▉| 1020/1024 [46:53:34<12:06, 181.60s/it][AINFO 12-02 18:24:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:24:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:24:50 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:24:50 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+100%|█████████▉| 1021/1024 [46:56:47<09:15, 185.20s/it][A
+                                                       [A{'loss': -0.0001, 'grad_norm': 0.0015309504233300686, 'learning_rate': 1e-05, 'num_tokens': 947968837.0, 'completions/mean_length': 7996.21875, 'completions/min_length': 940.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7207.62451171875, 'completions/min_terminated_length': 940.0, 'completions/max_terminated_length': 16142.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01925792172551155, 'sampling/sampling_logp_difference/max': 2.2097342014312744, 'sampling/importance_sampling_ratio/min': 0.10972980409860611, 'sampling/importance_sampling_ratio/mean': 1.000058650970459, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 4.545878095996159e-05, 'epoch': 0.94}
+
+100%|█████████▉| 1021/1024 [46:56:47<09:15, 185.20s/it][AINFO 12-02 18:28:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:04 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:28:04 [block_pool.py:292] Successfully reset prefix cache
+/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
+  warnings.warn(
+
+100%|█████████▉| 1022/1024 [46:59:45<06:05, 182.84s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.0009863757295534015, 'learning_rate': 1e-05, 'num_tokens': 949050730.0, 'completions/mean_length': 8313.9140625, 'completions/min_length': 1454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7775.90869140625, 'completions/min_terminated_length': 1454.0, 'completions/max_terminated_length': 15724.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018703892827033997, 'sampling/sampling_logp_difference/max': 3.7921416759490967, 'sampling/importance_sampling_ratio/min': 0.02254726178944111, 'sampling/importance_sampling_ratio/mean': 1.000022530555725, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.0245420532537537e-05, 'epoch': 0.94}
+
+100%|█████████▉| 1022/1024 [46:59:45<06:05, 182.84s/it][AINFO 12-02 18:31:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:31:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:31:01 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:31:01 [block_pool.py:292] Successfully reset prefix cache
+
+100%|█████████▉| 1023/1024 [47:02:57<03:05, 185.68s/it][A
+                                                       [A{'loss': -0.0, 'grad_norm': 0.0020384450908750296, 'learning_rate': 1e-05, 'num_tokens': 950091715.0, 'completions/mean_length': 7971.8828125, 'completions/min_length': 1150.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7181.00048828125, 'completions/min_terminated_length': 1150.0, 'completions/max_terminated_length': 16052.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.1054728776216507, 'frac_reward_zero_std': 0.75, 'sampling/sampling_logp_difference/mean': 0.020076964050531387, 'sampling/sampling_logp_difference/max': 2.144386053085327, 'sampling/importance_sampling_ratio/min': 0.11713993549346924, 'sampling/importance_sampling_ratio/mean': 0.9999734163284302, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 1.349265528460819e-05, 'epoch': 0.94}
+
+100%|█████████▉| 1023/1024 [47:02:57<03:05, 185.68s/it][AINFO 12-02 18:34:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:34:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:34:14 [block_pool.py:292] Successfully reset prefix cache
+INFO 12-02 18:34:14 [block_pool.py:292] Successfully reset prefix cache
+
+100%|██████████| 1024/1024 [47:05:58<00:00, 184.40s/it][A
+                                                       [A{'loss': 0.0, 'grad_norm': 0.001484273700043559, 'learning_rate': 1e-05, 'num_tokens': 951102084.0, 'completions/mean_length': 7732.2578125, 'completions/min_length': 1162.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6999.0595703125, 'completions/min_terminated_length': 1162.0, 'completions/max_terminated_length': 16064.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2280302792787552, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019798673689365387, 'sampling/sampling_logp_difference/max': 2.2883810997009277, 'sampling/importance_sampling_ratio/min': 0.10143054276704788, 'sampling/importance_sampling_ratio/mean': 1.0000054836273193, 'sampling/importance_sampling_ratio/max': 2.0, 'clip_ratio': 3.259437539782084e-05, 'epoch': 0.94}
+
+100%|██████████| 1024/1024 [47:05:58<00:00, 184.40s/it][A
+                                                       [A{'train_runtime': 169561.3944, 'train_samples_per_second': 0.773, 'train_steps_per_second': 0.006, 'train_loss': 9.439294444746338e-06, 'epoch': 0.94}
+
+100%|██████████| 1024/1024 [47:06:01<00:00, 184.40s/it][A[OpenTinker] 2025-12-02 18:37:17,833 - root - INFO - Training completed successfully
+[OpenTinker] 2025-12-02 18:37:17,833 - root - INFO - Training completed successfully
+100%|██████████| 1024/1024 [47:06:01<00:00, 165.59s/it]
+[OpenTinker] 2025-12-02 18:37:17,837 - root - INFO - Training completed successfully
+[OpenTinker] 2025-12-02 18:37:17,837 - root - INFO - Training completed successfully
+[OpenTinker] 2025-12-02 18:37:18,819 - root - INFO - Model saved to outputs/dr_grpo_lora_20251130_192918
+[OpenTinker] 2025-12-02 18:37:18,863 - root - INFO - Model saved to outputs/dr_grpo_lora_20251130_192918
+[OpenTinker] 2025-12-02 18:37:18,870 - root - INFO - Model saved to outputs/dr_grpo_lora_20251130_192918
+[OpenTinker] 2025-12-02 18:37:19,253 - root - INFO - Model saved to outputs/dr_grpo_lora_20251130_192918
+[1;34mwandb[0m: 
+[1;34mwandb[0m: 🚀 View run [33moutputs/dr_grpo_lora_20251130_192918[0m at: [34m[0m
+[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20251130_193013-axfzdypj/logs[0m
+[1;34mwandb[0m: 
+[1;34mwandb[0m: 🚀 View run [33moutputs/dr_grpo_lora_20251130_192918[0m at: [34m[0m
+[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20251130_193013-hblruoay/logs[0m
+[1;34mwandb[0m: 
+[1;34mwandb[0m: 🚀 View run [33moutputs/dr_grpo_lora_20251130_192918[0m at: [34m[0m
+[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20251130_193013-56oyy2tp/logs[0m
+[1;34mwandb[0m: 
+[1;34mwandb[0m: 🚀 View run [33moutputs/dr_grpo_lora_20251130_192918[0m at: [34m[0m
+[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20251130_193013-8qozoeij/logs[0m
+lshn-qs-e9wz-2:398356:459871 [2] NCCL INFO comm 0x1a4553e0 rank 2 nranks 4 cudaDev 2 busId a2000 - Abort COMPLETE
+lshn-qs-e9wz-2:398357:459873 [3] NCCL INFO comm 0x1b723300 rank 3 nranks 4 cudaDev 3 busId c6000 - Abort COMPLETE
+[rank2]:[W1202 18:37:23.890199198 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank3]:[W1202 18:37:23.910495244 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+lshn-qs-e9wz-2:398355:459875 [1] NCCL INFO comm 0x1a22d200 rank 1 nranks 4 cudaDev 1 busId 7e000 - Abort COMPLETE
+[rank1]:[W1202 18:37:23.970837963 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[rank0]:[W1202 18:37:24.633573492 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+lshn-qs-e9wz-2:398356:459878 [2] NCCL INFO comm 0x1a55cff0 rank 0 nranks 1 cudaDev 2 busId a2000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:459882 [1] NCCL INFO comm 0x1a3418c0 rank 0 nranks 1 cudaDev 1 busId 7e000 - Abort COMPLETE
+lshn-qs-e9wz-2:398357:459880 [3] NCCL INFO comm 0x1b837da0 rank 0 nranks 1 cudaDev 3 busId c6000 - Abort COMPLETE
+lshn-qs-e9wz-2:398356:459886 [2] NCCL INFO comm 0x1bc4c240 rank 0 nranks 1 cudaDev 2 busId a2000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:459888 [1] NCCL INFO comm 0x1ba8b5c0 rank 0 nranks 1 cudaDev 1 busId 7e000 - Abort COMPLETE
+lshn-qs-e9wz-2:398357:459890 [3] NCCL INFO comm 0x1cf940f0 rank 0 nranks 1 cudaDev 3 busId c6000 - Abort COMPLETE
+lshn-qs-e9wz-2:398354:459884 [0] NCCL INFO comm 0x1a2a9f20 rank 0 nranks 4 cudaDev 0 busId 8000 - Abort COMPLETE
+lshn-qs-e9wz-2:398357:459896 [3] NCCL INFO comm 0x1d09bd00 rank 0 nranks 1 cudaDev 3 busId c6000 - Abort COMPLETE
+lshn-qs-e9wz-2:398356:459892 [2] NCCL INFO comm 0x1bd53e50 rank 0 nranks 1 cudaDev 2 busId a2000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:459894 [1] NCCL INFO comm 0x1bb931d0 rank 0 nranks 1 cudaDev 1 busId 7e000 - Abort COMPLETE
+lshn-qs-e9wz-2:398354:459898 [0] NCCL INFO comm 0x1a3beb50 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE
+lshn-qs-e9wz-2:398356:459902 [2] NCCL INFO comm 0x1be5ba60 rank 0 nranks 1 cudaDev 2 busId a2000 - Abort COMPLETE
+lshn-qs-e9wz-2:398357:459900 [3] NCCL INFO comm 0x1d1a3910 rank 0 nranks 1 cudaDev 3 busId c6000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:459904 [1] NCCL INFO comm 0x1bc9ade0 rank 0 nranks 1 cudaDev 1 busId 7e000 - Abort COMPLETE
+lshn-qs-e9wz-2:398354:459906 [0] NCCL INFO comm 0x1bb1b890 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE
+lshn-qs-e9wz-2:398356:459908 [2] NCCL INFO comm 0x1bf63670 rank 0 nranks 1 cudaDev 2 busId a2000 - Abort COMPLETE
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398356:399458 [2] NCCL INFO misc/socket.cc:915 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398357:459910 [3] NCCL INFO comm 0x1d2ab520 rank 0 nranks 1 cudaDev 3 busId c6000 - Abort COMPLETE
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398356:399458 [2] NCCL INFO misc/socket.cc:915 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398357:399452 [3] NCCL INFO misc/socket.cc:915 -> 3
+lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO comm 0x191e30a0 rank 2 nranks 4 cudaDev 2 busId a2000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:459912 [1] NCCL INFO comm 0x1bda29f0 rank 0 nranks 1 cudaDev 1 busId 7e000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398355:399454 [1] NCCL INFO misc/socket.cc:915 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398354:459914 [0] NCCL INFO comm 0x1bc234a0 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE
+lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO comm 0x1a5a5680 rank 3 nranks 4 cudaDev 3 busId c6000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO comm 0x19032270 rank 1 nranks 4 cudaDev 1 busId 7e000 - Abort COMPLETE
+lshn-qs-e9wz-2:398355:400037 [1] NCCL INFO [Service thread] Connection closed by localRank 2
+lshn-qs-e9wz-2:398357:400041 [3] NCCL INFO [Service thread] Connection closed by localRank 2
+lshn-qs-e9wz-2:398354:459922 [0] NCCL INFO comm 0x1bd2b0b0 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE
+lshn-qs-e9wz-2:398354:400039 [0] NCCL INFO [Service thread] Connection closed by localRank 3
+lshn-qs-e9wz-2:398354:459925 [0] NCCL INFO comm 0x1be32cc0 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398354:399456 [0] NCCL INFO misc/socket.cc:915 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:64 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:81 -> 3
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:863 -> 3
+lshn-qs-e9wz-2:398354:400039 [0] NCCL INFO [Service thread] Connection closed by localRank 1
+lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO comm 0x18ac50d0 rank 0 nranks 4 cudaDev 0 busId 8000 - Abort COMPLETE
diff --git a/grpo_lora_20251130_192918/special_tokens_map.json b/grpo_lora_20251130_192918/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1
--- /dev/null
+++ b/grpo_lora_20251130_192918/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/grpo_lora_20251130_192918/tokenizer_config.json b/grpo_lora_20251130_192918/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9
--- /dev/null
+++ b/grpo_lora_20251130_192918/tokenizer_config.json
@@ -0,0 +1,194 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}